diff --git a/Examples/CoulombSum3d/coulomb_rtx500ada_full_search_space.t4.json b/Examples/CoulombSum3d/coulomb_rtx500ada_full_search_space.t4.json index 3f49061c..cb0d4727 100644 --- a/Examples/CoulombSum3d/coulomb_rtx500ada_full_search_space.t4.json +++ b/Examples/CoulombSum3d/coulomb_rtx500ada_full_search_space.t4.json @@ -5,20 +5,37 @@ "compute_api": "CUDA", "device": "NVIDIA RTX 500 Ada Generation Laptop GPU", "platform": "NVIDIA CUDA", - "timestamp": "2026-01-27 09:27:37 UTC", + "timestamp": "2026-03-13 09:42:10 UTC", "timeunit": "microseconds" }, "results": [ { + "compilation_data": { + "constant_memory_size": 0, + "global_size": { + "x": 16, + "y": 64, + "z": 256 + }, + "local_memory_size": 0, + "local_size": { + "x": 16, + "y": 4, + "z": 1 + }, + "max_work_group_size": 1024, + "private_memory_size": 0, + "registers": 44 + }, "configuration": { - "INNER_UNROLL_FACTOR": "0", - "USE_CONSTANT_MEMORY": "0", - "USE_SOA": "0", - "VECTOR_SIZE": "1", - "WORK_GROUP_SIZE_X": "16", - "WORK_GROUP_SIZE_Y": "4", - "WORK_GROUP_SIZE_Z": "1", - "Z_ITERATIONS": "1" + "INNER_UNROLL_FACTOR": 0, + "USE_CONSTANT_MEMORY": 0, + "USE_SOA": 0, + "VECTOR_SIZE": 1, + "WORK_GROUP_SIZE_X": 16, + "WORK_GROUP_SIZE_Y": 4, + "WORK_GROUP_SIZE_Z": 1, + "Z_ITERATIONS": 1 }, "correctness": 1, "invalidity": "correct", @@ -26,61 +43,61 @@ { "name": "time", "unit": "", - "value": 4651.232 + "value": 4625.056 }, { "name": "dram__throughput.avg.pct_of_peak_sustained_elapsed", "type": "Double", "unit": "", - "value": 21.999069148936172 + "value": 10.413613167579596 }, { "name": "dram__sectors_read.sum", "type": "Double", "unit": "", - "value": 2102324.0 + "value": 684.0 }, { "name": "dram__sectors_write.sum", "type": "Double", "unit": "", - "value": 1868068.0 + "value": 1868208.0 }, { "name": "lts__t_sectors.avg.pct_of_peak_sustained_elapsed", "type": "Double", "unit": "", - "value": 3.167714690042399 + "value": 1.61945582588573 }, { "name": "lts__t_sectors_op_read.sum", "type": "Double", "unit": "", - "value": 2158716.0 + "value": 65116.0 }, { "name": "lts__t_sectors_op_write.sum", "type": "Double", "unit": "", - "value": 2100115.0 + "value": 2099676.0 }, { "name": "l1tex__t_requests_pipe_lsu_mem_global_op_ld.avg.pct_of_peak_sustained_active", "type": "Double", "unit": "", - "value": 22.719930504269065 + "value": 22.494046099611182 }, { "name": "l1tex__t_requests_pipe_lsu_mem_global_op_ld.sum", "type": "Double", "unit": "", - "value": 34078720.0 + "value": 33554432.0 }, { "name": "l1tex__data_pipe_lsu_wavefronts_mem_shared.avg.pct_of_peak_sustained_elapsed", "type": "Double", "unit": "", - "value": 1.0482729233324062 + "value": 1.0541427733786057 }, { "name": "l1tex__data_pipe_lsu_wavefronts_mem_shared_op_ld.sum", @@ -110,13 +127,13 @@ "name": "sm__warps_active.avg.pct_of_peak_sustained_active", "type": "Double", "unit": "", - "value": 63.0898831843032 + "value": 60.270256988922235 }, { "name": "smsp__cycles_active.avg.pct_of_peak_sustained_elapsed", "type": "Double", "unit": "", - "value": 99.95217016189612 + "value": 99.94543870312687 }, { "name": "smsp__sass_thread_inst_executed_op_fp16_pred_on.sum", @@ -128,7 +145,7 @@ "name": "smsp__sass_thread_inst_executed_op_fp32_pred_on.sum", "type": "Double", "unit": "", - "value": 8606711808.0 + "value": 8589934592.0 }, { "name": "smsp__sass_thread_inst_executed_op_fp64_pred_on.sum", @@ -152,7 +169,7 @@ "name": "smsp__sass_thread_inst_executed_op_memory_pred_on.sum", "type": "Double", "unit": "", - "value": 1107296256.0 + "value": 1090519040.0 }, { "name": "smsp__sass_thread_inst_executed_op_misc_pred_on.sum", @@ -170,13 +187,13 @@ "name": "smsp__inst_executed.sum", "type": "Double", "unit": "", - "value": 342360064.0 + "value": 341311488.0 }, { "name": "smsp__pipe_fma_cycles_active.avg.pct_of_peak_sustained_active", "type": "Double", "unit": "", - "value": 39.853397078956995 + "value": 39.991802159177155 }, { "name": "smsp__inst_executed_pipe_fp64.avg.pct_of_peak_sustained_active", @@ -188,13 +205,13 @@ "name": "smsp__inst_executed_pipe_xu.avg.pct_of_peak_sustained_active", "type": "Double", "unit": "", - "value": 44.74771415475139 + "value": 45.00131163005578 }, { "name": "smsp__inst_executed_pipe_lsu.avg.pct_of_peak_sustained_active", "type": "Double", "unit": "", - "value": 12.060907330772839 + "value": 11.953473401733568 }, { "name": "smsp__inst_executed_pipe_tex.avg.pct_of_peak_sustained_active", @@ -206,7 +223,7 @@ "name": "smsp__issue_active.avg.pct_of_peak_sustained_active", "type": "Double", "unit": "", - "value": 57.07150258663136 + "value": 57.2191579633614 }, { "name": "smsp__thread_inst_executed_per_inst_executed.ratio", @@ -225,30 +242,47 @@ "time" ], "times": { - "compilation": 15894.075, - "data": 63084.019, - "framework": 274855.673, - "kernel_overhead": 61015.864, - "profiling_overhead": 53355.97, - "profiling_runs": 97399.82, + "compilation_time": 16550.897, + "data": 61128.287, + "framework": 269589.194, + "kernel_overhead": 60468.296, + "profiling_overhead": 51204.158, + "profiling_runs": 96788.453, "runtimes": [ - 4651.232 + 4625.056 ], - "search_algorithm": 21.935, - "validation": 18.527 + "search_algorithm": 25.784, + "validation": 13.724 }, - "timestamp": "2026-01-27 09:24:55 UTC" + "timestamp": "2026-03-13 09:39:30 UTC" }, { + "compilation_data": { + "constant_memory_size": 0, + "global_size": { + "x": 16, + "y": 64, + "z": 128 + }, + "local_memory_size": 0, + "local_size": { + "x": 16, + "y": 4, + "z": 1 + }, + "max_work_group_size": 1024, + "private_memory_size": 0, + "registers": 31 + }, "configuration": { - "INNER_UNROLL_FACTOR": "0", - "USE_CONSTANT_MEMORY": "0", - "USE_SOA": "0", - "VECTOR_SIZE": "1", - "WORK_GROUP_SIZE_X": "16", - "WORK_GROUP_SIZE_Y": "8", - "WORK_GROUP_SIZE_Z": "1", - "Z_ITERATIONS": "1" + "INNER_UNROLL_FACTOR": 0, + "USE_CONSTANT_MEMORY": 0, + "USE_SOA": 0, + "VECTOR_SIZE": 1, + "WORK_GROUP_SIZE_X": 16, + "WORK_GROUP_SIZE_Y": 4, + "WORK_GROUP_SIZE_Z": 1, + "Z_ITERATIONS": 2 }, "correctness": 1, "invalidity": "correct", @@ -256,61 +290,61 @@ { "name": "time", "unit": "", - "value": 3832.736 + "value": 2122.56 }, { "name": "dram__throughput.avg.pct_of_peak_sustained_elapsed", "type": "Double", "unit": "", - "value": 27.653704339834757 + "value": 23.048801316078777 }, { "name": "dram__sectors_read.sum", "type": "Double", "unit": "", - "value": 2097640.0 + "value": 6604.0 }, { "name": "dram__sectors_write.sum", "type": "Double", "unit": "", - "value": 1869556.0 + "value": 1871936.0 }, { "name": "lts__t_sectors.avg.pct_of_peak_sustained_elapsed", "type": "Double", "unit": "", - "value": 3.2557753347641527 + "value": 2.9495507070514804 }, { "name": "lts__t_sectors_op_read.sum", "type": "Double", "unit": "", - "value": 2152522.0 + "value": 41726.0 }, { "name": "lts__t_sectors_op_write.sum", "type": "Double", "unit": "", - "value": 2099338.0 + "value": 2100138.0 }, { "name": "l1tex__t_requests_pipe_lsu_mem_global_op_ld.avg.pct_of_peak_sustained_active", "type": "Double", "unit": "", - "value": 22.724279323889956 + "value": 19.910985735743168 }, { "name": "l1tex__t_requests_pipe_lsu_mem_global_op_ld.sum", "type": "Double", "unit": "", - "value": 34078720.0 + "value": 16777216.0 }, { "name": "l1tex__data_pipe_lsu_wavefronts_mem_shared.avg.pct_of_peak_sustained_elapsed", "type": "Double", "unit": "", - "value": 1.0484961465334799 + "value": 0.9329052962190687 }, { "name": "l1tex__data_pipe_lsu_wavefronts_mem_shared_op_ld.sum", @@ -340,13 +374,13 @@ "name": "sm__warps_active.avg.pct_of_peak_sustained_active", "type": "Double", "unit": "", - "value": 71.17747964784758 + "value": 95.3901192598929 }, { "name": "smsp__cycles_active.avg.pct_of_peak_sustained_elapsed", "type": "Double", "unit": "", - "value": 99.96101380252213 + "value": 99.94629043595558 }, { "name": "smsp__sass_thread_inst_executed_op_fp16_pred_on.sum", @@ -358,7 +392,7 @@ "name": "smsp__sass_thread_inst_executed_op_fp32_pred_on.sum", "type": "Double", "unit": "", - "value": 8606711808.0 + "value": 6442450944.0 }, { "name": "smsp__sass_thread_inst_executed_op_fp64_pred_on.sum", @@ -370,43 +404,43 @@ "name": "smsp__sass_thread_inst_executed_op_integer_pred_on.sum", "type": "Double", "unit": "", - "value": 587202560.0 + "value": 704643072.0 }, { "name": "smsp__sass_thread_inst_executed_op_control_pred_on.sum", "type": "Double", "unit": "", - "value": 117440512.0 + "value": 142606336.0 }, { "name": "smsp__sass_thread_inst_executed_op_memory_pred_on.sum", "type": "Double", "unit": "", - "value": 1107296256.0 + "value": 553648128.0 }, { "name": "smsp__sass_thread_inst_executed_op_misc_pred_on.sum", "type": "Double", "unit": "", - "value": 234881024.0 + "value": 134217728.0 }, { "name": "smsp__sass_thread_inst_executed_op_conversion_pred_on.sum", "type": "Double", "unit": "", - "value": 50331648.0 + "value": 25165824.0 }, { "name": "smsp__inst_executed.sum", "type": "Double", "unit": "", - "value": 342360064.0 + "value": 256901120.0 }, { "name": "smsp__pipe_fma_cycles_active.avg.pct_of_peak_sustained_active", "type": "Double", "unit": "", - "value": 39.85674782392271 + "value": 50.483527003321306 }, { "name": "smsp__inst_executed_pipe_fp64.avg.pct_of_peak_sustained_active", @@ -418,13 +452,13 @@ "name": "smsp__inst_executed_pipe_xu.avg.pct_of_peak_sustained_active", "type": "Double", "unit": "", - "value": 44.75328318879029 + "value": 79.65069865369911 }, { "name": "smsp__inst_executed_pipe_lsu.avg.pct_of_peak_sustained_active", "type": "Double", "unit": "", - "value": 12.062408359478633 + "value": 10.734176185752418 }, { "name": "smsp__inst_executed_pipe_tex.avg.pct_of_peak_sustained_active", @@ -436,7 +470,7 @@ "name": "smsp__issue_active.avg.pct_of_peak_sustained_active", "type": "Double", "unit": "", - "value": 57.07860953396677 + "value": 76.22919815387539 }, { "name": "smsp__thread_inst_executed_per_inst_executed.ratio", @@ -455,30 +489,47 @@ "time" ], "times": { - "compilation": 14591.79, - "data": 63808.032, - "framework": 252762.744, - "kernel_overhead": 52546.761, - "profiling_overhead": 53544.089, - "profiling_runs": 82863.862, + "compilation_time": 14436.949, + "data": 63774.84, + "framework": 273382.684, + "kernel_overhead": 66712.869, + "profiling_overhead": 53480.402, + "profiling_runs": 89414.573, "runtimes": [ - 3832.736 + 2122.56 ], - "search_algorithm": 15.249, - "validation": 13.217 + "search_algorithm": 30.007, + "validation": 16.354 }, - "timestamp": "2026-01-27 09:24:55 UTC" + "timestamp": "2026-03-13 09:39:30 UTC" }, { + "compilation_data": { + "constant_memory_size": 0, + "global_size": { + "x": 16, + "y": 64, + "z": 64 + }, + "local_memory_size": 0, + "local_size": { + "x": 16, + "y": 4, + "z": 1 + }, + "max_work_group_size": 1024, + "private_memory_size": 0, + "registers": 39 + }, "configuration": { - "INNER_UNROLL_FACTOR": "0", - "USE_CONSTANT_MEMORY": "0", - "USE_SOA": "0", - "VECTOR_SIZE": "1", - "WORK_GROUP_SIZE_X": "32", - "WORK_GROUP_SIZE_Y": "2", - "WORK_GROUP_SIZE_Z": "1", - "Z_ITERATIONS": "1" + "INNER_UNROLL_FACTOR": 0, + "USE_CONSTANT_MEMORY": 0, + "USE_SOA": 0, + "VECTOR_SIZE": 1, + "WORK_GROUP_SIZE_X": 16, + "WORK_GROUP_SIZE_Y": 4, + "WORK_GROUP_SIZE_Z": 1, + "Z_ITERATIONS": 4 }, "correctness": 1, "invalidity": "correct", @@ -486,61 +537,61 @@ { "name": "time", "unit": "", - "value": 4195.2 + "value": 1828.192 }, { "name": "dram__throughput.avg.pct_of_peak_sustained_elapsed", "type": "Double", "unit": "", - "value": 26.933837240860065 + "value": 27.00451696165192 }, { "name": "dram__sectors_read.sum", "type": "Double", "unit": "", - "value": 2099248.0 + "value": 2480.0 }, { "name": "dram__sectors_write.sum", "type": "Double", "unit": "", - "value": 1838660.0 + "value": 1872368.0 }, { "name": "lts__t_sectors.avg.pct_of_peak_sustained_elapsed", "type": "Double", "unit": "", - "value": 3.28597843151674 + "value": 3.346337561497289 }, { "name": "lts__t_sectors_op_read.sum", "type": "Double", "unit": "", - "value": 2161221.0 + "value": 36972.0 }, { "name": "lts__t_sectors_op_write.sum", "type": "Double", "unit": "", - "value": 2099596.0 + "value": 2103303.0 }, { "name": "l1tex__t_requests_pipe_lsu_mem_global_op_ld.avg.pct_of_peak_sustained_active", "type": "Double", "unit": "", - "value": 22.804108847162897 + "value": 11.290646780919694 }, { "name": "l1tex__t_requests_pipe_lsu_mem_global_op_ld.sum", "type": "Double", "unit": "", - "value": 34078720.0 + "value": 8388608.0 }, { "name": "l1tex__data_pipe_lsu_wavefronts_mem_shared.avg.pct_of_peak_sustained_elapsed", "type": "Double", "unit": "", - "value": 1.0521705512978097 + "value": 0.5289034228289748 }, { "name": "l1tex__data_pipe_lsu_wavefronts_mem_shared_op_ld.sum", @@ -570,13 +621,13 @@ "name": "sm__warps_active.avg.pct_of_peak_sustained_active", "type": "Double", "unit": "", - "value": 63.229384176356774 + "value": 97.11650820752672 }, { "name": "smsp__cycles_active.avg.pct_of_peak_sustained_elapsed", "type": "Double", "unit": "", - "value": 99.9563813179575 + "value": 99.90403221337345 }, { "name": "smsp__sass_thread_inst_executed_op_fp16_pred_on.sum", @@ -588,7 +639,7 @@ "name": "smsp__sass_thread_inst_executed_op_fp32_pred_on.sum", "type": "Double", "unit": "", - "value": 8606711808.0 + "value": 5368709120.0 }, { "name": "smsp__sass_thread_inst_executed_op_fp64_pred_on.sum", @@ -600,43 +651,43 @@ "name": "smsp__sass_thread_inst_executed_op_integer_pred_on.sum", "type": "Double", "unit": "", - "value": 587202560.0 + "value": 390070272.0 }, { "name": "smsp__sass_thread_inst_executed_op_control_pred_on.sum", "type": "Double", "unit": "", - "value": 117440512.0 + "value": 71303168.0 }, { "name": "smsp__sass_thread_inst_executed_op_memory_pred_on.sum", "type": "Double", "unit": "", - "value": 1107296256.0 + "value": 285212672.0 }, { "name": "smsp__sass_thread_inst_executed_op_misc_pred_on.sum", "type": "Double", "unit": "", - "value": 234881024.0 + "value": 88080384.0 }, { "name": "smsp__sass_thread_inst_executed_op_conversion_pred_on.sum", "type": "Double", "unit": "", - "value": 50331648.0 + "value": 12582912.0 }, { "name": "smsp__inst_executed.sum", "type": "Double", "unit": "", - "value": 342360064.0 + "value": 197394432.0 }, { "name": "smsp__pipe_fma_cycles_active.avg.pct_of_peak_sustained_active", "type": "Double", "unit": "", - "value": 39.99950810570688 + "value": 45.83808438201746 }, { "name": "smsp__inst_executed_pipe_fp64.avg.pct_of_peak_sustained_active", @@ -648,13 +699,13 @@ "name": "smsp__inst_executed_pipe_xu.avg.pct_of_peak_sustained_active", "type": "Double", "unit": "", - "value": 44.91220029855307 + "value": 90.35289383517835 }, { "name": "smsp__inst_executed_pipe_lsu.avg.pct_of_peak_sustained_active", "type": "Double", "unit": "", - "value": 12.10524148671938 + "value": 6.264702599900061 }, { "name": "smsp__inst_executed_pipe_tex.avg.pct_of_peak_sustained_active", @@ -666,7 +717,7 @@ "name": "smsp__issue_active.avg.pct_of_peak_sustained_active", "type": "Double", "unit": "", - "value": 57.28128886477305 + "value": 66.44250720680364 }, { "name": "smsp__thread_inst_executed_per_inst_executed.ratio", @@ -685,30 +736,47 @@ "time" ], "times": { - "compilation": 13392.944, - "data": 61640.938, - "framework": 249183.528, - "kernel_overhead": 52751.502, - "profiling_overhead": 51337.493, - "profiling_runs": 83453.595, + "compilation_time": 14301.771, + "data": 64456.265, + "framework": 207942.981, + "kernel_overhead": 34402.855, + "profiling_overhead": 53730.376, + "profiling_runs": 55353.485, "runtimes": [ - 4195.2 + 1828.192 ], - "search_algorithm": 20.661, - "validation": 15.155 + "search_algorithm": 22.356, + "validation": 14.318 }, - "timestamp": "2026-01-27 09:24:55 UTC" + "timestamp": "2026-03-13 09:39:30 UTC" }, { + "compilation_data": { + "constant_memory_size": 0, + "global_size": { + "x": 16, + "y": 64, + "z": 32 + }, + "local_memory_size": 0, + "local_size": { + "x": 16, + "y": 4, + "z": 1 + }, + "max_work_group_size": 1024, + "private_memory_size": 0, + "registers": 40 + }, "configuration": { - "INNER_UNROLL_FACTOR": "0", - "USE_CONSTANT_MEMORY": "0", - "USE_SOA": "0", - "VECTOR_SIZE": "1", - "WORK_GROUP_SIZE_X": "32", - "WORK_GROUP_SIZE_Y": "4", - "WORK_GROUP_SIZE_Z": "1", - "Z_ITERATIONS": "1" + "INNER_UNROLL_FACTOR": 0, + "USE_CONSTANT_MEMORY": 0, + "USE_SOA": 0, + "VECTOR_SIZE": 1, + "WORK_GROUP_SIZE_X": 16, + "WORK_GROUP_SIZE_Y": 4, + "WORK_GROUP_SIZE_Z": 1, + "Z_ITERATIONS": 8 }, "correctness": 1, "invalidity": "correct", @@ -716,61 +784,61 @@ { "name": "time", "unit": "", - "value": 3863.712 + "value": 1786.944 }, { "name": "dram__throughput.avg.pct_of_peak_sustained_elapsed", "type": "Double", "unit": "", - "value": 27.469185280182394 + "value": 27.459425162486657 }, { "name": "dram__sectors_read.sum", "type": "Double", "unit": "", - "value": 2107232.0 + "value": 4780.0 }, { "name": "dram__sectors_write.sum", "type": "Double", "unit": "", - "value": 1840736.0 + "value": 1871740.0 }, { "name": "lts__t_sectors.avg.pct_of_peak_sustained_elapsed", "type": "Double", "unit": "", - "value": 3.2911661880046226 + "value": 3.43097102667654 }, { "name": "lts__t_sectors_op_read.sum", "type": "Double", "unit": "", - "value": 2169574.0 + "value": 36703.0 }, { "name": "lts__t_sectors_op_write.sum", "type": "Double", "unit": "", - "value": 2105187.0 + "value": 2100249.0 }, { "name": "l1tex__t_requests_pipe_lsu_mem_global_op_ld.avg.pct_of_peak_sustained_active", "type": "Double", "unit": "", - "value": 22.813756531737464 + "value": 5.79713181637969 }, { "name": "l1tex__t_requests_pipe_lsu_mem_global_op_ld.sum", "type": "Double", "unit": "", - "value": 34078720.0 + "value": 4194304.0 }, { "name": "l1tex__data_pipe_lsu_wavefronts_mem_shared.avg.pct_of_peak_sustained_elapsed", "type": "Double", "unit": "", - "value": 1.0526402580716179 + "value": 0.2715836989664437 }, { "name": "l1tex__data_pipe_lsu_wavefronts_mem_shared_op_ld.sum", @@ -800,13 +868,13 @@ "name": "sm__warps_active.avg.pct_of_peak_sustained_active", "type": "Double", "unit": "", - "value": 70.89450355115572 + "value": 98.4168065338735 }, { "name": "smsp__cycles_active.avg.pct_of_peak_sustained_elapsed", "type": "Double", "unit": "", - "value": 99.97109238732202 + "value": 99.92458230027061 }, { "name": "smsp__sass_thread_inst_executed_op_fp16_pred_on.sum", @@ -818,7 +886,7 @@ "name": "smsp__sass_thread_inst_executed_op_fp32_pred_on.sum", "type": "Double", "unit": "", - "value": 8606711808.0 + "value": 4831838208.0 }, { "name": "smsp__sass_thread_inst_executed_op_fp64_pred_on.sum", @@ -830,43 +898,43 @@ "name": "smsp__sass_thread_inst_executed_op_integer_pred_on.sum", "type": "Double", "unit": "", - "value": 587202560.0 + "value": 362807296.0 }, { "name": "smsp__sass_thread_inst_executed_op_control_pred_on.sum", "type": "Double", "unit": "", - "value": 117440512.0 + "value": 69206016.0 }, { "name": "smsp__sass_thread_inst_executed_op_memory_pred_on.sum", "type": "Double", "unit": "", - "value": 1107296256.0 + "value": 150994944.0 }, { "name": "smsp__sass_thread_inst_executed_op_misc_pred_on.sum", "type": "Double", "unit": "", - "value": 234881024.0 + "value": 127926272.0 }, { "name": "smsp__sass_thread_inst_executed_op_conversion_pred_on.sum", "type": "Double", "unit": "", - "value": 50331648.0 + "value": 6291456.0 }, { "name": "smsp__inst_executed.sum", "type": "Double", "unit": "", - "value": 342360064.0 + "value": 173801472.0 }, { "name": "smsp__pipe_fma_cycles_active.avg.pct_of_peak_sustained_active", "type": "Double", "unit": "", - "value": 40.012024154563306 + "value": 42.625018714518895 }, { "name": "smsp__inst_executed_pipe_fp64.avg.pct_of_peak_sustained_active", @@ -878,13 +946,13 @@ "name": "smsp__inst_executed_pipe_xu.avg.pct_of_peak_sustained_active", "type": "Double", "unit": "", - "value": 44.92563794046464 + "value": 92.77053465047281 }, { "name": "smsp__inst_executed_pipe_lsu.avg.pct_of_peak_sustained_active", "type": "Double", "unit": "", - "value": 12.108863351140862 + "value": 3.397358446672589 }, { "name": "smsp__inst_executed_pipe_tex.avg.pct_of_peak_sustained_active", @@ -896,7 +964,7 @@ "name": "smsp__issue_active.avg.pct_of_peak_sustained_active", "type": "Double", "unit": "", - "value": 57.29843116293686 + "value": 60.066464073226456 }, { "name": "smsp__thread_inst_executed_per_inst_executed.ratio", @@ -915,30 +983,47 @@ "time" ], "times": { - "compilation": 12955.431, - "data": 63538.764, - "framework": 247475.775, - "kernel_overhead": 50322.479, - "profiling_overhead": 52939.861, - "profiling_runs": 80674.671, + "compilation_time": 13656.641, + "data": 64033.476, + "framework": 195497.96500000003, + "kernel_overhead": 28639.999, + "profiling_overhead": 53738.133, + "profiling_runs": 49086.357, "runtimes": [ - 3863.712 + 1786.944 ], - "search_algorithm": 23.26, - "validation": 15.311 + "search_algorithm": 23.138, + "validation": 15.619 }, - "timestamp": "2026-01-27 09:24:56 UTC" + "timestamp": "2026-03-13 09:39:30 UTC" }, { + "compilation_data": { + "constant_memory_size": 0, + "global_size": { + "x": 16, + "y": 64, + "z": 16 + }, + "local_memory_size": 0, + "local_size": { + "x": 16, + "y": 4, + "z": 1 + }, + "max_work_group_size": 1024, + "private_memory_size": 0, + "registers": 40 + }, "configuration": { - "INNER_UNROLL_FACTOR": "0", - "USE_CONSTANT_MEMORY": "0", - "USE_SOA": "0", - "VECTOR_SIZE": "1", - "WORK_GROUP_SIZE_X": "32", - "WORK_GROUP_SIZE_Y": "8", - "WORK_GROUP_SIZE_Z": "1", - "Z_ITERATIONS": "1" + "INNER_UNROLL_FACTOR": 0, + "USE_CONSTANT_MEMORY": 0, + "USE_SOA": 0, + "VECTOR_SIZE": 1, + "WORK_GROUP_SIZE_X": 16, + "WORK_GROUP_SIZE_Y": 4, + "WORK_GROUP_SIZE_Z": 1, + "Z_ITERATIONS": 16 }, "correctness": 1, "invalidity": "correct", @@ -946,61 +1031,61 @@ { "name": "time", "unit": "", - "value": 3792.416 + "value": 1750.304 }, { "name": "dram__throughput.avg.pct_of_peak_sustained_elapsed", "type": "Double", "unit": "", - "value": 26.870563365578676 + "value": 27.832814578977384 }, { "name": "dram__sectors_read.sum", "type": "Double", "unit": "", - "value": 2108580.0 + "value": 3456.0 }, { "name": "dram__sectors_write.sum", "type": "Double", "unit": "", - "value": 1845116.0 + "value": 1870828.0 }, { "name": "lts__t_sectors.avg.pct_of_peak_sustained_elapsed", "type": "Double", "unit": "", - "value": 3.300943957180106 + "value": 3.4932530263408057 }, { "name": "lts__t_sectors_op_read.sum", "type": "Double", "unit": "", - "value": 2171062.0 + "value": 33924.0 }, { "name": "lts__t_sectors_op_write.sum", "type": "Double", "unit": "", - "value": 2102819.0 + "value": 2100111.0 }, { "name": "l1tex__t_requests_pipe_lsu_mem_global_op_ld.avg.pct_of_peak_sustained_active", "type": "Double", "unit": "", - "value": 22.801248795295464 + "value": 2.9954802664507256 }, { "name": "l1tex__t_requests_pipe_lsu_mem_global_op_ld.sum", "type": "Double", "unit": "", - "value": 34078720.0 + "value": 2097152.0 }, { "name": "l1tex__data_pipe_lsu_wavefronts_mem_shared.avg.pct_of_peak_sustained_elapsed", "type": "Double", "unit": "", - "value": 1.0519954621566185 + "value": 0.14030270582811138 }, { "name": "l1tex__data_pipe_lsu_wavefronts_mem_shared_op_ld.sum", @@ -1030,13 +1115,13 @@ "name": "sm__warps_active.avg.pct_of_peak_sustained_active", "type": "Double", "unit": "", - "value": 73.03157707187836 + "value": 98.39189211473479 }, { "name": "smsp__cycles_active.avg.pct_of_peak_sustained_elapsed", "type": "Double", "unit": "", - "value": 99.95339735360602 + "value": 99.90434827634792 }, { "name": "smsp__sass_thread_inst_executed_op_fp16_pred_on.sum", @@ -1048,7 +1133,7 @@ "name": "smsp__sass_thread_inst_executed_op_fp32_pred_on.sum", "type": "Double", "unit": "", - "value": 8606711808.0 + "value": 4563402752.0 }, { "name": "smsp__sass_thread_inst_executed_op_fp64_pred_on.sum", @@ -1060,43 +1145,43 @@ "name": "smsp__sass_thread_inst_executed_op_integer_pred_on.sum", "type": "Double", "unit": "", - "value": 587202560.0 + "value": 209715200.0 }, { "name": "smsp__sass_thread_inst_executed_op_control_pred_on.sum", "type": "Double", "unit": "", - "value": 117440512.0 + "value": 67108864.0 }, { "name": "smsp__sass_thread_inst_executed_op_memory_pred_on.sum", "type": "Double", "unit": "", - "value": 1107296256.0 + "value": 83886080.0 }, { "name": "smsp__sass_thread_inst_executed_op_misc_pred_on.sum", "type": "Double", "unit": "", - "value": 234881024.0 + "value": 160432128.0 }, { "name": "smsp__sass_thread_inst_executed_op_conversion_pred_on.sum", "type": "Double", "unit": "", - "value": 50331648.0 + "value": 3145728.0 }, { "name": "smsp__inst_executed.sum", "type": "Double", "unit": "", - "value": 342360064.0 + "value": 163381248.0 }, { "name": "smsp__pipe_fma_cycles_active.avg.pct_of_peak_sustained_active", "type": "Double", "unit": "", - "value": 39.99393992034953 + "value": 39.532478914694174 }, { "name": "smsp__inst_executed_pipe_fp64.avg.pct_of_peak_sustained_active", @@ -1108,13 +1193,13 @@ "name": "smsp__inst_executed_pipe_xu.avg.pct_of_peak_sustained_active", "type": "Double", "unit": "", - "value": 44.90606713435846 + "value": 95.87168342968546 }, { "name": "smsp__inst_executed_pipe_lsu.avg.pct_of_peak_sustained_active", "type": "Double", "unit": "", - "value": 12.103588407307553 + "value": 1.9427123351230209 }, { "name": "smsp__inst_executed_pipe_tex.avg.pct_of_peak_sustained_active", @@ -1126,7 +1211,7 @@ "name": "smsp__issue_active.avg.pct_of_peak_sustained_active", "type": "Double", "unit": "", - "value": 57.27344651590261 + "value": 58.35282431031613 }, { "name": "smsp__thread_inst_executed_per_inst_executed.ratio", @@ -1145,30 +1230,47 @@ "time" ], "times": { - "compilation": 12710.457, - "data": 63425.718, - "framework": 251182.77000000002, - "kernel_overhead": 52215.42, - "profiling_overhead": 52969.994, - "profiling_runs": 82571.638, + "compilation_time": 13933.523, + "data": 65401.678, + "framework": 205428.967, + "kernel_overhead": 32372.173, + "profiling_overhead": 54710.918, + "profiling_runs": 52944.198, "runtimes": [ - 3792.416 + 1750.304 ], - "search_algorithm": 28.097, - "validation": 13.513 + "search_algorithm": 17.561, + "validation": 12.416 }, - "timestamp": "2026-01-27 09:24:56 UTC" + "timestamp": "2026-03-13 09:39:31 UTC" }, { + "compilation_data": { + "constant_memory_size": 0, + "global_size": { + "x": 16, + "y": 64, + "z": 8 + }, + "local_memory_size": 0, + "local_size": { + "x": 16, + "y": 4, + "z": 1 + }, + "max_work_group_size": 1024, + "private_memory_size": 0, + "registers": 48 + }, "configuration": { - "INNER_UNROLL_FACTOR": "0", - "USE_CONSTANT_MEMORY": "0", - "USE_SOA": "0", - "VECTOR_SIZE": "1", - "WORK_GROUP_SIZE_X": "16", - "WORK_GROUP_SIZE_Y": "4", - "WORK_GROUP_SIZE_Z": "1", - "Z_ITERATIONS": "2" + "INNER_UNROLL_FACTOR": 0, + "USE_CONSTANT_MEMORY": 0, + "USE_SOA": 0, + "VECTOR_SIZE": 1, + "WORK_GROUP_SIZE_X": 16, + "WORK_GROUP_SIZE_Y": 4, + "WORK_GROUP_SIZE_Z": 1, + "Z_ITERATIONS": 32 }, "correctness": 1, "invalidity": "correct", @@ -1176,61 +1278,61 @@ { "name": "time", "unit": "", - "value": 2103.232 + "value": 1741.76 }, { "name": "dram__throughput.avg.pct_of_peak_sustained_elapsed", "type": "Double", "unit": "", - "value": 48.9894403691434 + "value": 28.298179460782407 }, { "name": "dram__sectors_read.sum", "type": "Double", "unit": "", - "value": 2104528.0 + "value": 240.0 }, { "name": "dram__sectors_write.sum", "type": "Double", "unit": "", - "value": 1870436.0 + "value": 1864524.0 }, { "name": "lts__t_sectors.avg.pct_of_peak_sustained_elapsed", "type": "Double", "unit": "", - "value": 5.716705659760554 + "value": 3.518859581819769 }, { "name": "lts__t_sectors_op_read.sum", "type": "Double", "unit": "", - "value": 2125531.0 + "value": 28772.0 }, { "name": "lts__t_sectors_op_write.sum", "type": "Double", "unit": "", - "value": 2104498.0 + "value": 2098991.0 }, { "name": "l1tex__t_requests_pipe_lsu_mem_global_op_ld.avg.pct_of_peak_sustained_active", "type": "Double", "unit": "", - "value": 20.445196457654134 + "value": 1.4943897250673268 }, { "name": "l1tex__t_requests_pipe_lsu_mem_global_op_ld.sum", "type": "Double", "unit": "", - "value": 17301504.0 + "value": 1048576.0 }, { "name": "l1tex__data_pipe_lsu_wavefronts_mem_shared.avg.pct_of_peak_sustained_elapsed", "type": "Double", "unit": "", - "value": 0.9288396081155547 + "value": 0.07000424993249102 }, { "name": "l1tex__data_pipe_lsu_wavefronts_mem_shared_op_ld.sum", @@ -1260,13 +1362,13 @@ "name": "sm__warps_active.avg.pct_of_peak_sustained_active", "type": "Double", "unit": "", - "value": 95.36806905508905 + "value": 81.74260723095009 }, { "name": "smsp__cycles_active.avg.pct_of_peak_sustained_elapsed", "type": "Double", "unit": "", - "value": 99.95109735619351 + "value": 99.9257280145751 }, { "name": "smsp__sass_thread_inst_executed_op_fp16_pred_on.sum", @@ -1278,7 +1380,7 @@ "name": "smsp__sass_thread_inst_executed_op_fp32_pred_on.sum", "type": "Double", "unit": "", - "value": 6459228160.0 + "value": 4429185024.0 }, { "name": "smsp__sass_thread_inst_executed_op_fp64_pred_on.sum", @@ -1290,43 +1392,43 @@ "name": "smsp__sass_thread_inst_executed_op_integer_pred_on.sum", "type": "Double", "unit": "", - "value": 704643072.0 + "value": 138412032.0 }, { "name": "smsp__sass_thread_inst_executed_op_control_pred_on.sum", "type": "Double", "unit": "", - "value": 142606336.0 + "value": 33554432.0 }, { "name": "smsp__sass_thread_inst_executed_op_memory_pred_on.sum", "type": "Double", "unit": "", - "value": 570425344.0 + "value": 50331648.0 }, { "name": "smsp__sass_thread_inst_executed_op_misc_pred_on.sum", "type": "Double", "unit": "", - "value": 134217728.0 + "value": 82837504.0 }, { "name": "smsp__sass_thread_inst_executed_op_conversion_pred_on.sum", "type": "Double", "unit": "", - "value": 25165824.0 + "value": 1572864.0 }, { "name": "smsp__inst_executed.sum", "type": "Double", "unit": "", - "value": 257949696.0 + "value": 150192128.0 }, { "name": "smsp__pipe_fma_cycles_active.avg.pct_of_peak_sustained_active", "type": "Double", "unit": "", - "value": 50.41447144551186 + "value": 37.934593085022215 }, { "name": "smsp__inst_executed_pipe_fp64.avg.pct_of_peak_sustained_active", @@ -1338,13 +1440,13 @@ "name": "smsp__inst_executed_pipe_xu.avg.pct_of_peak_sustained_active", "type": "Double", "unit": "", - "value": 79.29975957148999 + "value": 95.65017719349177 }, { "name": "smsp__inst_executed_pipe_lsu.avg.pct_of_peak_sustained_active", "type": "Double", "unit": "", - "value": 10.996646346827713 + "value": 1.1559286550483014 }, { "name": "smsp__inst_executed_pipe_tex.avg.pct_of_peak_sustained_active", @@ -1356,7 +1458,7 @@ "name": "smsp__issue_active.avg.pct_of_peak_sustained_active", "type": "Double", "unit": "", - "value": 76.20309939807616 + "value": 53.5183388160655 }, { "name": "smsp__thread_inst_executed_per_inst_executed.ratio", @@ -1375,30 +1477,47 @@ "time" ], "times": { - "compilation": 12696.228, - "data": 64060.867, - "framework": 259293.284, - "kernel_overhead": 59562.187, - "profiling_overhead": 53886.224, - "profiling_runs": 81784.006, + "compilation_time": 13579.687, + "data": 64021.437, + "framework": 182811.388, + "kernel_overhead": 22194.174, + "profiling_overhead": 53988.269, + "profiling_runs": 42607.508, "runtimes": [ - 2103.232 + 1741.76 ], - "search_algorithm": 20.282, - "validation": 10.726 + "search_algorithm": 20.045, + "validation": 13.461 }, - "timestamp": "2026-01-27 09:24:56 UTC" + "timestamp": "2026-03-13 09:39:31 UTC" }, { + "compilation_data": { + "constant_memory_size": 0, + "global_size": { + "x": 16, + "y": 64, + "z": 128 + }, + "local_memory_size": 0, + "local_size": { + "x": 16, + "y": 4, + "z": 1 + }, + "max_work_group_size": 1024, + "private_memory_size": 0, + "registers": 19 + }, "configuration": { - "INNER_UNROLL_FACTOR": "0", - "USE_CONSTANT_MEMORY": "0", - "USE_SOA": "0", - "VECTOR_SIZE": "1", - "WORK_GROUP_SIZE_X": "16", - "WORK_GROUP_SIZE_Y": "8", - "WORK_GROUP_SIZE_Z": "1", - "Z_ITERATIONS": "2" + "INNER_UNROLL_FACTOR": 1, + "USE_CONSTANT_MEMORY": 0, + "USE_SOA": 0, + "VECTOR_SIZE": 1, + "WORK_GROUP_SIZE_X": 16, + "WORK_GROUP_SIZE_Y": 4, + "WORK_GROUP_SIZE_Z": 1, + "Z_ITERATIONS": 2 }, "correctness": 1, "invalidity": "correct", @@ -1406,61 +1525,61 @@ { "name": "time", "unit": "", - "value": 2107.776 + "value": 5072.768 }, { "name": "dram__throughput.avg.pct_of_peak_sustained_elapsed", "type": "Double", "unit": "", - "value": 48.82462202259211 + "value": 9.588635739415896 }, { "name": "dram__sectors_read.sum", "type": "Double", "unit": "", - "value": 2101224.0 + "value": 6520.0 }, { "name": "dram__sectors_write.sum", "type": "Double", "unit": "", - "value": 1871116.0 + "value": 1870312.0 }, { "name": "lts__t_sectors.avg.pct_of_peak_sustained_elapsed", "type": "Double", "unit": "", - "value": 5.691820956355132 + "value": 1.2065750913640265 }, { "name": "lts__t_sectors_op_read.sum", "type": "Double", "unit": "", - "value": 2121053.0 + "value": 92934.0 }, { "name": "lts__t_sectors_op_write.sum", "type": "Double", "unit": "", - "value": 2099430.0 + "value": 2101293.0 }, { "name": "l1tex__t_requests_pipe_lsu_mem_global_op_ld.avg.pct_of_peak_sustained_active", "type": "Double", "unit": "", - "value": 20.43997317090961 + "value": 7.954841736224723 }, { "name": "l1tex__t_requests_pipe_lsu_mem_global_op_ld.sum", "type": "Double", "unit": "", - "value": 17301504.0 + "value": 16777216.0 }, { "name": "l1tex__data_pipe_lsu_wavefronts_mem_shared.avg.pct_of_peak_sustained_elapsed", "type": "Double", "unit": "", - "value": 0.9286236741791437 + "value": 0.3727848550085191 }, { "name": "l1tex__data_pipe_lsu_wavefronts_mem_shared_op_ld.sum", @@ -1490,13 +1609,13 @@ "name": "sm__warps_active.avg.pct_of_peak_sustained_active", "type": "Double", "unit": "", - "value": 93.92937884280917 + "value": 97.57424503773251 }, { "name": "smsp__cycles_active.avg.pct_of_peak_sustained_elapsed", "type": "Double", "unit": "", - "value": 99.95518132505144 + "value": 99.96876785872067 }, { "name": "smsp__sass_thread_inst_executed_op_fp16_pred_on.sum", @@ -1508,7 +1627,7 @@ "name": "smsp__sass_thread_inst_executed_op_fp32_pred_on.sum", "type": "Double", "unit": "", - "value": 6459228160.0 + "value": 6979321856.0 }, { "name": "smsp__sass_thread_inst_executed_op_fp64_pred_on.sum", @@ -1520,25 +1639,25 @@ "name": "smsp__sass_thread_inst_executed_op_integer_pred_on.sum", "type": "Double", "unit": "", - "value": 704643072.0 + "value": 8212447232.0 }, { "name": "smsp__sass_thread_inst_executed_op_control_pred_on.sum", "type": "Double", "unit": "", - "value": 142606336.0 + "value": 1073741824.0 }, { "name": "smsp__sass_thread_inst_executed_op_memory_pred_on.sum", "type": "Double", "unit": "", - "value": 570425344.0 + "value": 553648128.0 }, { "name": "smsp__sass_thread_inst_executed_op_misc_pred_on.sum", "type": "Double", "unit": "", - "value": 134217728.0 + "value": 3321888768.0 }, { "name": "smsp__sass_thread_inst_executed_op_conversion_pred_on.sum", @@ -1550,13 +1669,13 @@ "name": "smsp__inst_executed.sum", "type": "Double", "unit": "", - "value": 257949696.0 + "value": 715390976.0 }, { "name": "smsp__pipe_fma_cycles_active.avg.pct_of_peak_sustained_active", "type": "Double", "unit": "", - "value": 50.400247107039256 + "value": 36.26513015383146 }, { "name": "smsp__inst_executed_pipe_fp64.avg.pct_of_peak_sustained_active", @@ -1568,13 +1687,13 @@ "name": "smsp__inst_executed_pipe_xu.avg.pct_of_peak_sustained_active", "type": "Double", "unit": "", - "value": 79.27808491713806 + "value": 31.820912646454406 }, { "name": "smsp__inst_executed_pipe_lsu.avg.pct_of_peak_sustained_active", "type": "Double", "unit": "", - "value": 10.993640681868753 + "value": 4.288365180869833 }, { "name": "smsp__inst_executed_pipe_tex.avg.pct_of_peak_sustained_active", @@ -1586,7 +1705,7 @@ "name": "smsp__issue_active.avg.pct_of_peak_sustained_active", "type": "Double", "unit": "", - "value": 76.18232725366094 + "value": 84.80443029046585 }, { "name": "smsp__thread_inst_executed_per_inst_executed.ratio", @@ -1605,30 +1724,47 @@ "time" ], "times": { - "compilation": 13735.602, - "data": 64822.879, - "framework": 263696.87, - "kernel_overhead": 60967.435, - "profiling_overhead": 54726.219, - "profiling_runs": 83180.337, + "compilation_time": 12964.069, + "data": 65241.53, + "framework": 2033676.3309999998, + "kernel_overhead": 938504.769, + "profiling_overhead": 54388.263, + "profiling_runs": 975541.769, "runtimes": [ - 2107.776 + 5072.768 ], - "search_algorithm": 31.496, - "validation": 17.017 + "search_algorithm": 31.782, + "validation": 17.514 }, - "timestamp": "2026-01-27 09:24:56 UTC" + "timestamp": "2026-03-13 09:39:32 UTC" }, { + "compilation_data": { + "constant_memory_size": 0, + "global_size": { + "x": 16, + "y": 64, + "z": 64 + }, + "local_memory_size": 0, + "local_size": { + "x": 16, + "y": 4, + "z": 1 + }, + "max_work_group_size": 1024, + "private_memory_size": 0, + "registers": 22 + }, "configuration": { - "INNER_UNROLL_FACTOR": "0", - "USE_CONSTANT_MEMORY": "0", - "USE_SOA": "0", - "VECTOR_SIZE": "1", - "WORK_GROUP_SIZE_X": "32", - "WORK_GROUP_SIZE_Y": "2", - "WORK_GROUP_SIZE_Z": "1", - "Z_ITERATIONS": "2" + "INNER_UNROLL_FACTOR": 1, + "USE_CONSTANT_MEMORY": 0, + "USE_SOA": 0, + "VECTOR_SIZE": 1, + "WORK_GROUP_SIZE_X": 16, + "WORK_GROUP_SIZE_Y": 4, + "WORK_GROUP_SIZE_Z": 1, + "Z_ITERATIONS": 4 }, "correctness": 1, "invalidity": "correct", @@ -1636,61 +1772,61 @@ { "name": "time", "unit": "", - "value": 2172.736 + "value": 5694.816 }, { "name": "dram__throughput.avg.pct_of_peak_sustained_elapsed", "type": "Double", "unit": "", - "value": 48.7781876722745 + "value": 8.615016645062456 }, { "name": "dram__sectors_read.sum", "type": "Double", "unit": "", - "value": 2102452.0 + "value": 412.0 }, { "name": "dram__sectors_write.sum", "type": "Double", "unit": "", - "value": 1838888.0 + "value": 1871128.0 }, { "name": "lts__t_sectors.avg.pct_of_peak_sustained_elapsed", "type": "Double", "unit": "", - "value": 5.756236458540647 + "value": 1.1010689399689677 }, { "name": "lts__t_sectors_op_read.sum", "type": "Double", "unit": "", - "value": 2138777.0 + "value": 95824.0 }, { "name": "lts__t_sectors_op_write.sum", "type": "Double", "unit": "", - "value": 2099865.0 + "value": 2099079.0 }, { "name": "l1tex__t_requests_pipe_lsu_mem_global_op_ld.avg.pct_of_peak_sustained_active", "type": "Double", "unit": "", - "value": 20.457812185325896 + "value": 3.6246960469967497 }, { "name": "l1tex__t_requests_pipe_lsu_mem_global_op_ld.sum", "type": "Double", "unit": "", - "value": 17301504.0 + "value": 8388608.0 }, { "name": "l1tex__data_pipe_lsu_wavefronts_mem_shared.avg.pct_of_peak_sustained_elapsed", "type": "Double", "unit": "", - "value": 0.9295669406114107 + "value": 0.16986943637160654 }, { "name": "l1tex__data_pipe_lsu_wavefronts_mem_shared_op_ld.sum", @@ -1720,13 +1856,13 @@ "name": "sm__warps_active.avg.pct_of_peak_sustained_active", "type": "Double", "unit": "", - "value": 95.40942179179784 + "value": 98.73392674578345 }, { "name": "smsp__cycles_active.avg.pct_of_peak_sustained_elapsed", "type": "Double", "unit": "", - "value": 99.94859812730616 + "value": 99.97171899129378 }, { "name": "smsp__sass_thread_inst_executed_op_fp16_pred_on.sum", @@ -1738,7 +1874,7 @@ "name": "smsp__sass_thread_inst_executed_op_fp32_pred_on.sum", "type": "Double", "unit": "", - "value": 6459228160.0 + "value": 5637144576.0 }, { "name": "smsp__sass_thread_inst_executed_op_fp64_pred_on.sum", @@ -1750,43 +1886,43 @@ "name": "smsp__sass_thread_inst_executed_op_integer_pred_on.sum", "type": "Double", "unit": "", - "value": 704643072.0 + "value": 10049552384.0 }, { "name": "smsp__sass_thread_inst_executed_op_control_pred_on.sum", "type": "Double", "unit": "", - "value": 142606336.0 + "value": 1073741824.0 }, { "name": "smsp__sass_thread_inst_executed_op_memory_pred_on.sum", "type": "Double", "unit": "", - "value": 570425344.0 + "value": 285212672.0 }, { "name": "smsp__sass_thread_inst_executed_op_misc_pred_on.sum", "type": "Double", "unit": "", - "value": 134217728.0 + "value": 1392508928.0 }, { "name": "smsp__sass_thread_inst_executed_op_conversion_pred_on.sum", "type": "Double", "unit": "", - "value": 25165824.0 + "value": 12582912.0 }, { "name": "smsp__inst_executed.sum", "type": "Double", "unit": "", - "value": 257949696.0 + "value": 786956288.0 }, { "name": "smsp__pipe_fma_cycles_active.avg.pct_of_peak_sustained_active", "type": "Double", "unit": "", - "value": 50.455131122317 + "value": 33.82778398688032 }, { "name": "smsp__inst_executed_pipe_fp64.avg.pct_of_peak_sustained_active", @@ -1798,13 +1934,13 @@ "name": "smsp__inst_executed_pipe_xu.avg.pct_of_peak_sustained_active", "type": "Double", "unit": "", - "value": 79.36384009889296 + "value": 28.999251755001087 }, { "name": "smsp__inst_executed_pipe_lsu.avg.pct_of_peak_sustained_active", "type": "Double", "unit": "", - "value": 11.005532513713673 + "value": 2.0106903072315205 }, { "name": "smsp__inst_executed_pipe_tex.avg.pct_of_peak_sustained_active", @@ -1816,7 +1952,7 @@ "name": "smsp__issue_active.avg.pct_of_peak_sustained_active", "type": "Double", "unit": "", - "value": 76.26467935111755 + "value": 85.01577820318393 }, { "name": "smsp__thread_inst_executed_per_inst_executed.ratio", @@ -1835,30 +1971,47 @@ "time" ], "times": { - "compilation": 12737.195, - "data": 64895.124, - "framework": 261758.47000000003, - "kernel_overhead": 59990.023, - "profiling_overhead": 54818.368, - "profiling_runs": 82054.955, + "compilation_time": 14114.672, + "data": 64248.526, + "framework": 2610953.772, + "kernel_overhead": 1226808.237, + "profiling_overhead": 53472.917, + "profiling_runs": 1266424.092, "runtimes": [ - 2172.736 + 5694.816 ], - "search_algorithm": 19.429, - "validation": 12.901 + "search_algorithm": 21.164, + "validation": 11.626 }, - "timestamp": "2026-01-27 09:24:56 UTC" + "timestamp": "2026-03-13 09:39:33 UTC" }, { + "compilation_data": { + "constant_memory_size": 0, + "global_size": { + "x": 16, + "y": 64, + "z": 32 + }, + "local_memory_size": 0, + "local_size": { + "x": 16, + "y": 4, + "z": 1 + }, + "max_work_group_size": 1024, + "private_memory_size": 32, + "registers": 27 + }, "configuration": { - "INNER_UNROLL_FACTOR": "0", - "USE_CONSTANT_MEMORY": "0", - "USE_SOA": "0", - "VECTOR_SIZE": "1", - "WORK_GROUP_SIZE_X": "32", - "WORK_GROUP_SIZE_Y": "4", - "WORK_GROUP_SIZE_Z": "1", - "Z_ITERATIONS": "2" + "INNER_UNROLL_FACTOR": 1, + "USE_CONSTANT_MEMORY": 0, + "USE_SOA": 0, + "VECTOR_SIZE": 1, + "WORK_GROUP_SIZE_X": 16, + "WORK_GROUP_SIZE_Y": 4, + "WORK_GROUP_SIZE_Z": 1, + "Z_ITERATIONS": 8 }, "correctness": 1, "invalidity": "correct", @@ -1866,61 +2019,61 @@ { "name": "time", "unit": "", - "value": 2167.616 + "value": 8063.36 }, { "name": "dram__throughput.avg.pct_of_peak_sustained_elapsed", "type": "Double", "unit": "", - "value": 48.781884522791344 + "value": 6.283676341194658 }, { "name": "dram__sectors_read.sum", "type": "Double", "unit": "", - "value": 2103588.0 + "value": 19524.0 }, { "name": "dram__sectors_write.sum", "type": "Double", "unit": "", - "value": 1838800.0 + "value": 1936688.0 }, { "name": "lts__t_sectors.avg.pct_of_peak_sustained_elapsed", "type": "Double", "unit": "", - "value": 5.763029892442016 + "value": 48.767527188637146 }, { "name": "lts__t_sectors_op_read.sum", "type": "Double", "unit": "", - "value": 2139631.0 + "value": 973615.0 }, { "name": "lts__t_sectors_op_write.sum", "type": "Double", "unit": "", - "value": 2099821.0 + "value": 138421553.0 }, { "name": "l1tex__t_requests_pipe_lsu_mem_global_op_ld.avg.pct_of_peak_sustained_active", "type": "Double", "unit": "", - "value": 20.461939577123957 + "value": 1.2712572009462428 }, { "name": "l1tex__t_requests_pipe_lsu_mem_global_op_ld.sum", "type": "Double", "unit": "", - "value": 17301504.0 + "value": 4194304.0 }, { "name": "l1tex__data_pipe_lsu_wavefronts_mem_shared.avg.pct_of_peak_sustained_elapsed", "type": "Double", "unit": "", - "value": 0.9295705445359647 + "value": 0.059242074600756006 }, { "name": "l1tex__data_pipe_lsu_wavefronts_mem_shared_op_ld.sum", @@ -1938,25 +2091,25 @@ "name": "l1tex__t_sectors_pipe_lsu_mem_local_op_ld.sum", "type": "Double", "unit": "", - "value": 0.0 + "value": 136314880.0 }, { "name": "l1tex__t_sectors_pipe_lsu_mem_local_op_st.sum", "type": "Double", "unit": "", - "value": 0.0 + "value": 136314880.0 }, { "name": "sm__warps_active.avg.pct_of_peak_sustained_active", "type": "Double", "unit": "", - "value": 94.61398937577567 + "value": 91.06120526459911 }, { "name": "smsp__cycles_active.avg.pct_of_peak_sustained_elapsed", "type": "Double", "unit": "", - "value": 99.93817998675392 + "value": 99.02011953455342 }, { "name": "smsp__sass_thread_inst_executed_op_fp16_pred_on.sum", @@ -1968,7 +2121,7 @@ "name": "smsp__sass_thread_inst_executed_op_fp32_pred_on.sum", "type": "Double", "unit": "", - "value": 6459228160.0 + "value": 4966055936.0 }, { "name": "smsp__sass_thread_inst_executed_op_fp64_pred_on.sum", @@ -1980,43 +2133,43 @@ "name": "smsp__sass_thread_inst_executed_op_integer_pred_on.sum", "type": "Double", "unit": "", - "value": 704643072.0 + "value": 3982491648.0 }, { "name": "smsp__sass_thread_inst_executed_op_control_pred_on.sum", "type": "Double", "unit": "", - "value": 142606336.0 + "value": 1073741824.0 }, { "name": "smsp__sass_thread_inst_executed_op_memory_pred_on.sum", "type": "Double", "unit": "", - "value": 570425344.0 + "value": 2319450112.0 }, { "name": "smsp__sass_thread_inst_executed_op_misc_pred_on.sum", "type": "Double", "unit": "", - "value": 134217728.0 + "value": 155189248.0 }, { "name": "smsp__sass_thread_inst_executed_op_conversion_pred_on.sum", "type": "Double", "unit": "", - "value": 25165824.0 + "value": 6291456.0 }, { "name": "smsp__inst_executed.sum", "type": "Double", "unit": "", - "value": 257949696.0 + "value": 395247616.0 }, { "name": "smsp__pipe_fma_cycles_active.avg.pct_of_peak_sustained_active", "type": "Double", "unit": "", - "value": 50.45997383498428 + "value": 10.679570754287457 }, { "name": "smsp__inst_executed_pipe_fp64.avg.pct_of_peak_sustained_active", @@ -2028,13 +2181,13 @@ "name": "smsp__inst_executed_pipe_xu.avg.pct_of_peak_sustained_active", "type": "Double", "unit": "", - "value": 79.37242117501978 + "value": 20.42140010748195 }, { "name": "smsp__inst_executed_pipe_lsu.avg.pct_of_peak_sustained_active", "type": "Double", "unit": "", - "value": 11.006722467629695 + "value": 11.058267929295647 }, { "name": "smsp__inst_executed_pipe_tex.avg.pct_of_peak_sustained_active", @@ -2046,7 +2199,7 @@ "name": "smsp__issue_active.avg.pct_of_peak_sustained_active", "type": "Double", "unit": "", - "value": 76.27297974201687 + "value": 30.06902552361049 }, { "name": "smsp__thread_inst_executed_per_inst_executed.ratio", @@ -2065,30 +2218,47 @@ "time" ], "times": { - "compilation": 13014.506, - "data": 65424.243, - "framework": 260001.507, - "kernel_overhead": 58639.622, - "profiling_overhead": 55330.292, - "profiling_runs": 80607.35, + "compilation_time": 14084.45, + "data": 60331.688, + "framework": 877465.015, + "kernel_overhead": 357516.736, + "profiling_overhead": 50138.773, + "profiling_runs": 409477.818, "runtimes": [ - 2167.616 + 8063.36 ], - "search_algorithm": 26.244, - "validation": 14.934 + "search_algorithm": 21.502, + "validation": 15.481 }, - "timestamp": "2026-01-27 09:24:57 UTC" + "timestamp": "2026-03-13 09:39:34 UTC" }, { + "compilation_data": { + "constant_memory_size": 0, + "global_size": { + "x": 16, + "y": 64, + "z": 16 + }, + "local_memory_size": 0, + "local_size": { + "x": 16, + "y": 4, + "z": 1 + }, + "max_work_group_size": 1024, + "private_memory_size": 64, + "registers": 32 + }, "configuration": { - "INNER_UNROLL_FACTOR": "0", - "USE_CONSTANT_MEMORY": "0", - "USE_SOA": "0", - "VECTOR_SIZE": "1", - "WORK_GROUP_SIZE_X": "32", - "WORK_GROUP_SIZE_Y": "8", - "WORK_GROUP_SIZE_Z": "1", - "Z_ITERATIONS": "2" + "INNER_UNROLL_FACTOR": 1, + "USE_CONSTANT_MEMORY": 0, + "USE_SOA": 0, + "VECTOR_SIZE": 1, + "WORK_GROUP_SIZE_X": 16, + "WORK_GROUP_SIZE_Y": 4, + "WORK_GROUP_SIZE_Z": 1, + "Z_ITERATIONS": 16 }, "correctness": 1, "invalidity": "correct", @@ -2096,61 +2266,61 @@ { "name": "time", "unit": "", - "value": 2162.432 + "value": 8382.56 }, { "name": "dram__throughput.avg.pct_of_peak_sustained_elapsed", "type": "Double", "unit": "", - "value": 48.516484815994446 + "value": 6.413510059969771 }, { "name": "dram__sectors_read.sum", "type": "Double", "unit": "", - "value": 2099936.0 + "value": 196.0 }, { "name": "dram__sectors_write.sum", "type": "Double", "unit": "", - "value": 1838640.0 + "value": 2020308.0 }, { "name": "lts__t_sectors.avg.pct_of_peak_sustained_elapsed", "type": "Double", "unit": "", - "value": 5.728774893376334 + "value": 54.20263848269499 }, { "name": "lts__t_sectors_op_read.sum", "type": "Double", "unit": "", - "value": 2134255.0 + "value": 17168434.0 }, { "name": "lts__t_sectors_op_write.sum", "type": "Double", "unit": "", - "value": 2099949.0 + "value": 138413785.0 }, { "name": "l1tex__t_requests_pipe_lsu_mem_global_op_ld.avg.pct_of_peak_sustained_active", "type": "Double", "unit": "", - "value": 20.358561709573273 + "value": 0.6356024314961998 }, { "name": "l1tex__t_requests_pipe_lsu_mem_global_op_ld.sum", "type": "Double", "unit": "", - "value": 17301504.0 + "value": 2097152.0 }, { "name": "l1tex__data_pipe_lsu_wavefronts_mem_shared.avg.pct_of_peak_sustained_elapsed", "type": "Double", "unit": "", - "value": 0.9254018980227893 + "value": 0.02980169720762608 }, { "name": "l1tex__data_pipe_lsu_wavefronts_mem_shared_op_ld.sum", @@ -2168,25 +2338,25 @@ "name": "l1tex__t_sectors_pipe_lsu_mem_local_op_ld.sum", "type": "Double", "unit": "", - "value": 0.0 + "value": 136314880.0 }, { "name": "l1tex__t_sectors_pipe_lsu_mem_local_op_st.sum", "type": "Double", "unit": "", - "value": 0.0 + "value": 136314880.0 }, { "name": "sm__warps_active.avg.pct_of_peak_sustained_active", "type": "Double", "unit": "", - "value": 96.12230270202133 + "value": 88.24275091013848 }, { "name": "smsp__cycles_active.avg.pct_of_peak_sustained_elapsed", "type": "Double", "unit": "", - "value": 99.97350521197525 + "value": 100.4407949437812 }, { "name": "smsp__sass_thread_inst_executed_op_fp16_pred_on.sum", @@ -2198,7 +2368,7 @@ "name": "smsp__sass_thread_inst_executed_op_fp32_pred_on.sum", "type": "Double", "unit": "", - "value": 6459228160.0 + "value": 4630511616.0 }, { "name": "smsp__sass_thread_inst_executed_op_fp64_pred_on.sum", @@ -2210,43 +2380,43 @@ "name": "smsp__sass_thread_inst_executed_op_integer_pred_on.sum", "type": "Double", "unit": "", - "value": 704643072.0 + "value": 3635412992.0 }, { "name": "smsp__sass_thread_inst_executed_op_control_pred_on.sum", "type": "Double", "unit": "", - "value": 142606336.0 + "value": 1073741824.0 }, { "name": "smsp__sass_thread_inst_executed_op_memory_pred_on.sum", "type": "Double", "unit": "", - "value": 570425344.0 + "value": 2252341248.0 }, { "name": "smsp__sass_thread_inst_executed_op_misc_pred_on.sum", "type": "Double", "unit": "", - "value": 134217728.0 + "value": 77594624.0 }, { "name": "smsp__sass_thread_inst_executed_op_conversion_pred_on.sum", "type": "Double", "unit": "", - "value": 25165824.0 + "value": 3145728.0 }, { "name": "smsp__inst_executed.sum", "type": "Double", "unit": "", - "value": 257949696.0 + "value": 367034368.0 }, { "name": "smsp__pipe_fma_cycles_active.avg.pct_of_peak_sustained_active", "type": "Double", "unit": "", - "value": 50.2173490313751 + "value": 9.153129560773474 }, { "name": "smsp__inst_executed_pipe_fp64.avg.pct_of_peak_sustained_active", @@ -2258,13 +2428,13 @@ "name": "smsp__inst_executed_pipe_xu.avg.pct_of_peak_sustained_active", "type": "Double", "unit": "", - "value": 78.98855648188172 + "value": 20.25534077575423 }, { "name": "smsp__inst_executed_pipe_lsu.avg.pct_of_peak_sustained_active", "type": "Double", "unit": "", - "value": 10.953491230885941 + "value": 10.63702099820492 }, { "name": "smsp__inst_executed_pipe_tex.avg.pct_of_peak_sustained_active", @@ -2276,7 +2446,7 @@ "name": "smsp__issue_active.avg.pct_of_peak_sustained_active", "type": "Double", "unit": "", - "value": 75.90417004020563 + "value": 27.695636552078074 }, { "name": "smsp__thread_inst_executed_per_inst_executed.ratio", @@ -2295,30 +2465,47 @@ "time" ], "times": { - "compilation": 13959.155, - "data": 65241.577, - "framework": 263063.631, - "kernel_overhead": 60492.696, - "profiling_overhead": 54907.83, - "profiling_runs": 82421.528, + "compilation_time": 13809.213, + "data": 62159.044, + "framework": 853320.324, + "kernel_overhead": 343405.938, + "profiling_overhead": 52145.757, + "profiling_runs": 395609.585, "runtimes": [ - 2162.432 + 8382.56 ], - "search_algorithm": 34.132, - "validation": 17.361 + "search_algorithm": 20.819, + "validation": 15.164 }, - "timestamp": "2026-01-27 09:24:57 UTC" + "timestamp": "2026-03-13 09:39:34 UTC" }, { + "compilation_data": { + "constant_memory_size": 0, + "global_size": { + "x": 16, + "y": 64, + "z": 8 + }, + "local_memory_size": 0, + "local_size": { + "x": 16, + "y": 4, + "z": 1 + }, + "max_work_group_size": 1024, + "private_memory_size": 128, + "registers": 32 + }, "configuration": { - "INNER_UNROLL_FACTOR": "0", - "USE_CONSTANT_MEMORY": "0", - "USE_SOA": "0", - "VECTOR_SIZE": "1", - "WORK_GROUP_SIZE_X": "16", - "WORK_GROUP_SIZE_Y": "4", - "WORK_GROUP_SIZE_Z": "1", - "Z_ITERATIONS": "4" + "INNER_UNROLL_FACTOR": 1, + "USE_CONSTANT_MEMORY": 0, + "USE_SOA": 0, + "VECTOR_SIZE": 1, + "WORK_GROUP_SIZE_X": 16, + "WORK_GROUP_SIZE_Y": 4, + "WORK_GROUP_SIZE_Z": 1, + "Z_ITERATIONS": 32 }, "correctness": 1, "invalidity": "correct", @@ -2326,61 +2513,61 @@ { "name": "time", "unit": "", - "value": 1879.456 + "value": 9547.52 }, { "name": "dram__throughput.avg.pct_of_peak_sustained_elapsed", "type": "Double", "unit": "", - "value": 55.97537091585948 + "value": 6.32302599031768 }, { "name": "dram__sectors_read.sum", "type": "Double", "unit": "", - "value": 2098312.0 + "value": 1852.0 }, { "name": "dram__sectors_write.sum", "type": "Double", "unit": "", - "value": 1866428.0 + "value": 2288520.0 }, { "name": "lts__t_sectors.avg.pct_of_peak_sustained_elapsed", "type": "Double", "unit": "", - "value": 6.527628811216685 + "value": 75.23392989899871 }, { "name": "lts__t_sectors_op_read.sum", "type": "Double", "unit": "", - "value": 2113782.0 + "value": 111496879.0 }, { "name": "lts__t_sectors_op_write.sum", "type": "Double", "unit": "", - "value": 2099419.0 + "value": 138419087.0 }, { "name": "l1tex__t_requests_pipe_lsu_mem_global_op_ld.avg.pct_of_peak_sustained_active", "type": "Double", "unit": "", - "value": 11.957712805363945 + "value": 0.2773444689236649 }, { "name": "l1tex__t_requests_pipe_lsu_mem_global_op_ld.sum", "type": "Double", "unit": "", - "value": 8912896.0 + "value": 1048576.0 }, { "name": "l1tex__data_pipe_lsu_wavefronts_mem_shared.avg.pct_of_peak_sustained_elapsed", "type": "Double", "unit": "", - "value": 0.5271242376447481 + "value": 0.01282342037438627 }, { "name": "l1tex__data_pipe_lsu_wavefronts_mem_shared_op_ld.sum", @@ -2398,25 +2585,25 @@ "name": "l1tex__t_sectors_pipe_lsu_mem_local_op_ld.sum", "type": "Double", "unit": "", - "value": 0.0 + "value": 136314880.0 }, { "name": "l1tex__t_sectors_pipe_lsu_mem_local_op_st.sum", "type": "Double", "unit": "", - "value": 0.0 + "value": 136314880.0 }, { "name": "sm__warps_active.avg.pct_of_peak_sustained_active", "type": "Double", "unit": "", - "value": 96.18329462963054 + "value": 91.88202686374977 }, { "name": "smsp__cycles_active.avg.pct_of_peak_sustained_elapsed", "type": "Double", "unit": "", - "value": 99.92771171327496 + "value": 98.0279153563047 }, { "name": "smsp__sass_thread_inst_executed_op_fp16_pred_on.sum", @@ -2428,7 +2615,7 @@ "name": "smsp__sass_thread_inst_executed_op_fp32_pred_on.sum", "type": "Double", "unit": "", - "value": 5385486336.0 + "value": 4462739456.0 }, { "name": "smsp__sass_thread_inst_executed_op_fp64_pred_on.sum", @@ -2440,43 +2627,43 @@ "name": "smsp__sass_thread_inst_executed_op_integer_pred_on.sum", "type": "Double", "unit": "", - "value": 390070272.0 + "value": 3461873664.0 }, { "name": "smsp__sass_thread_inst_executed_op_control_pred_on.sum", "type": "Double", "unit": "", - "value": 71303168.0 + "value": 1073741824.0 }, { "name": "smsp__sass_thread_inst_executed_op_memory_pred_on.sum", "type": "Double", "unit": "", - "value": 301989888.0 + "value": 2218786816.0 }, { "name": "smsp__sass_thread_inst_executed_op_misc_pred_on.sum", "type": "Double", "unit": "", - "value": 88080384.0 + "value": 38797312.0 }, { "name": "smsp__sass_thread_inst_executed_op_conversion_pred_on.sum", "type": "Double", "unit": "", - "value": 12582912.0 + "value": 1572864.0 }, { "name": "smsp__inst_executed.sum", "type": "Double", "unit": "", - "value": 198443008.0 + "value": 352927744.0 }, { "name": "smsp__pipe_fma_cycles_active.avg.pct_of_peak_sustained_active", "type": "Double", "unit": "", - "value": 45.84885140371172 + "value": 7.4387690247722835 }, { "name": "smsp__inst_executed_pipe_fp64.avg.pct_of_peak_sustained_active", @@ -2488,13 +2675,13 @@ "name": "smsp__inst_executed_pipe_xu.avg.pct_of_peak_sustained_active", "type": "Double", "unit": "", - "value": 90.02761597920778 + "value": 17.86046680770238 }, { "name": "smsp__inst_executed_pipe_lsu.avg.pct_of_peak_sustained_active", "type": "Double", "unit": "", - "value": 6.593819529727132 + "value": 9.233285758132274 }, { "name": "smsp__inst_executed_pipe_tex.avg.pct_of_peak_sustained_active", @@ -2506,7 +2693,7 @@ "name": "smsp__issue_active.avg.pct_of_peak_sustained_active", "type": "Double", "unit": "", - "value": 66.55497740910504 + "value": 23.4824681896418 }, { "name": "smsp__thread_inst_executed_per_inst_executed.ratio", @@ -2525,30 +2712,47 @@ "time" ], "times": { - "compilation": 13554.856, - "data": 65502.107, - "framework": 212599.62900000002, - "kernel_overhead": 35658.869, - "profiling_overhead": 55076.96, - "profiling_runs": 56361.693, + "compilation_time": 13067.866, + "data": 61728.469, + "framework": 832414.729, + "kernel_overhead": 330272.982, + "profiling_overhead": 51607.952, + "profiling_runs": 388805.326, "runtimes": [ - 1879.456 + 9547.52 ], - "search_algorithm": 20.045, - "validation": 10.621 + "search_algorithm": 29.393, + "validation": 14.131 }, - "timestamp": "2026-01-27 09:24:57 UTC" + "timestamp": "2026-03-13 09:39:35 UTC" }, { + "compilation_data": { + "constant_memory_size": 0, + "global_size": { + "x": 16, + "y": 64, + "z": 64 + }, + "local_memory_size": 0, + "local_size": { + "x": 16, + "y": 4, + "z": 1 + }, + "max_work_group_size": 1024, + "private_memory_size": 0, + "registers": 23 + }, "configuration": { - "INNER_UNROLL_FACTOR": "0", - "USE_CONSTANT_MEMORY": "0", - "USE_SOA": "0", - "VECTOR_SIZE": "1", - "WORK_GROUP_SIZE_X": "16", - "WORK_GROUP_SIZE_Y": "8", - "WORK_GROUP_SIZE_Z": "1", - "Z_ITERATIONS": "4" + "INNER_UNROLL_FACTOR": 2, + "USE_CONSTANT_MEMORY": 0, + "USE_SOA": 0, + "VECTOR_SIZE": 1, + "WORK_GROUP_SIZE_X": 16, + "WORK_GROUP_SIZE_Y": 4, + "WORK_GROUP_SIZE_Z": 1, + "Z_ITERATIONS": 4 }, "correctness": 1, "invalidity": "correct", @@ -2556,61 +2760,61 @@ { "name": "time", "unit": "", - "value": 1958.304 + "value": 3432.96 }, { "name": "dram__throughput.avg.pct_of_peak_sustained_elapsed", "type": "Double", "unit": "", - "value": 55.13961868919974 + "value": 14.25429101964633 }, { "name": "dram__sectors_read.sum", "type": "Double", "unit": "", - "value": 2104024.0 + "value": 15540.0 }, { "name": "dram__sectors_write.sum", "type": "Double", "unit": "", - "value": 1871816.0 + "value": 1880860.0 }, { "name": "lts__t_sectors.avg.pct_of_peak_sustained_elapsed", "type": "Double", "unit": "", - "value": 6.564895183710831 + "value": 1.8256710163815433 }, { "name": "lts__t_sectors_op_read.sum", "type": "Double", "unit": "", - "value": 2121666.0 + "value": 78447.0 }, { "name": "lts__t_sectors_op_write.sum", "type": "Double", "unit": "", - "value": 2105224.0 + "value": 2134319.0 }, { "name": "l1tex__t_requests_pipe_lsu_mem_global_op_ld.avg.pct_of_peak_sustained_active", "type": "Double", "unit": "", - "value": 11.953228628462792 + "value": 6.059491718883135 }, { "name": "l1tex__t_requests_pipe_lsu_mem_global_op_ld.sum", "type": "Double", "unit": "", - "value": 8912896.0 + "value": 8388608.0 }, { "name": "l1tex__data_pipe_lsu_wavefronts_mem_shared.avg.pct_of_peak_sustained_elapsed", "type": "Double", "unit": "", - "value": 0.5270744387555845 + "value": 0.2839478151148024 }, { "name": "l1tex__data_pipe_lsu_wavefronts_mem_shared_op_ld.sum", @@ -2640,13 +2844,13 @@ "name": "sm__warps_active.avg.pct_of_peak_sustained_active", "type": "Double", "unit": "", - "value": 94.71476902332331 + "value": 98.31439879116267 }, { "name": "smsp__cycles_active.avg.pct_of_peak_sustained_elapsed", "type": "Double", "unit": "", - "value": 99.92725777330782 + "value": 99.95440736089881 }, { "name": "smsp__sass_thread_inst_executed_op_fp16_pred_on.sum", @@ -2658,7 +2862,7 @@ "name": "smsp__sass_thread_inst_executed_op_fp32_pred_on.sum", "type": "Double", "unit": "", - "value": 5385486336.0 + "value": 5637144576.0 }, { "name": "smsp__sass_thread_inst_executed_op_fp64_pred_on.sum", @@ -2670,25 +2874,25 @@ "name": "smsp__sass_thread_inst_executed_op_integer_pred_on.sum", "type": "Double", "unit": "", - "value": 390070272.0 + "value": 4945084416.0 }, { "name": "smsp__sass_thread_inst_executed_op_control_pred_on.sum", "type": "Double", "unit": "", - "value": 71303168.0 + "value": 536870912.0 }, { "name": "smsp__sass_thread_inst_executed_op_memory_pred_on.sum", "type": "Double", "unit": "", - "value": 301989888.0 + "value": 285212672.0 }, { "name": "smsp__sass_thread_inst_executed_op_misc_pred_on.sum", "type": "Double", "unit": "", - "value": 88080384.0 + "value": 1665138688.0 }, { "name": "smsp__sass_thread_inst_executed_op_conversion_pred_on.sum", @@ -2700,13 +2904,13 @@ "name": "smsp__inst_executed.sum", "type": "Double", "unit": "", - "value": 198443008.0 + "value": 484966400.0 }, { "name": "smsp__pipe_fma_cycles_active.avg.pct_of_peak_sustained_active", "type": "Double", "unit": "", - "value": 45.84422382894784 + "value": 41.38078969006535 }, { "name": "smsp__inst_executed_pipe_fp64.avg.pct_of_peak_sustained_active", @@ -2718,13 +2922,13 @@ "name": "smsp__inst_executed_pipe_xu.avg.pct_of_peak_sustained_active", "type": "Double", "unit": "", - "value": 90.01951975074405 + "value": 48.48253157857601 }, { "name": "smsp__inst_executed_pipe_lsu.avg.pct_of_peak_sustained_active", "type": "Double", "unit": "", - "value": 6.593226544243949 + "value": 3.3615817793739224 }, { "name": "smsp__inst_executed_pipe_tex.avg.pct_of_peak_sustained_active", @@ -2736,7 +2940,7 @@ "name": "smsp__issue_active.avg.pct_of_peak_sustained_active", "type": "Double", "unit": "", - "value": 66.54905379093708 + "value": 87.59117074591536 }, { "name": "smsp__thread_inst_executed_per_inst_executed.ratio", @@ -2755,30 +2959,47 @@ "time" ], "times": { - "compilation": 13345.404, - "data": 64587.961, - "framework": 210240.054, - "kernel_overhead": 34936.139, - "profiling_overhead": 54571.254, - "profiling_runs": 56144.7, + "compilation_time": 13061.264, + "data": 61445.276, + "framework": 1333557.745, + "kernel_overhead": 595954.893, + "profiling_overhead": 51376.48, + "profiling_runs": 624781.096, "runtimes": [ - 1958.304 + 3432.96 ], - "search_algorithm": 25.103, - "validation": 14.787 + "search_algorithm": 20.507, + "validation": 14.662 }, - "timestamp": "2026-01-27 09:24:57 UTC" + "timestamp": "2026-03-13 09:39:35 UTC" }, { + "compilation_data": { + "constant_memory_size": 0, + "global_size": { + "x": 16, + "y": 64, + "z": 32 + }, + "local_memory_size": 0, + "local_size": { + "x": 16, + "y": 4, + "z": 1 + }, + "max_work_group_size": 1024, + "private_memory_size": 0, + "registers": 27 + }, "configuration": { - "INNER_UNROLL_FACTOR": "0", - "USE_CONSTANT_MEMORY": "0", - "USE_SOA": "0", - "VECTOR_SIZE": "1", - "WORK_GROUP_SIZE_X": "32", - "WORK_GROUP_SIZE_Y": "2", - "WORK_GROUP_SIZE_Z": "1", - "Z_ITERATIONS": "4" + "INNER_UNROLL_FACTOR": 2, + "USE_CONSTANT_MEMORY": 0, + "USE_SOA": 0, + "VECTOR_SIZE": 1, + "WORK_GROUP_SIZE_X": 16, + "WORK_GROUP_SIZE_Y": 4, + "WORK_GROUP_SIZE_Z": 1, + "Z_ITERATIONS": 8 }, "correctness": 1, "invalidity": "correct", @@ -2786,61 +3007,61 @@ { "name": "time", "unit": "", - "value": 1852.928 + "value": 4067.52 }, { "name": "dram__throughput.avg.pct_of_peak_sustained_elapsed", "type": "Double", "unit": "", - "value": 55.004300714924035 + "value": 12.065163768618515 }, { "name": "dram__sectors_read.sum", "type": "Double", "unit": "", - "value": 2100648.0 + "value": 484.0 }, { "name": "dram__sectors_write.sum", "type": "Double", "unit": "", - "value": 1838540.0 + "value": 1869960.0 }, { "name": "lts__t_sectors.avg.pct_of_peak_sustained_elapsed", "type": "Double", "unit": "", - "value": 6.498999091802597 + "value": 1.5249789663890714 }, { "name": "lts__t_sectors_op_read.sum", "type": "Double", "unit": "", - "value": 2132959.0 + "value": 70175.0 }, { "name": "lts__t_sectors_op_write.sum", "type": "Double", "unit": "", - "value": 2100783.0 + "value": 2099095.0 }, { "name": "l1tex__t_requests_pipe_lsu_mem_global_op_ld.avg.pct_of_peak_sustained_active", "type": "Double", "unit": "", - "value": 11.967754333792875 + "value": 2.5397549873578837 }, { "name": "l1tex__t_requests_pipe_lsu_mem_global_op_ld.sum", "type": "Double", "unit": "", - "value": 8912896.0 + "value": 4194304.0 }, { "name": "l1tex__data_pipe_lsu_wavefronts_mem_shared.avg.pct_of_peak_sustained_elapsed", "type": "Double", "unit": "", - "value": 0.5275382546267193 + "value": 0.1190199681954779 }, { "name": "l1tex__data_pipe_lsu_wavefronts_mem_shared_op_ld.sum", @@ -2870,13 +3091,13 @@ "name": "sm__warps_active.avg.pct_of_peak_sustained_active", "type": "Double", "unit": "", - "value": 96.36563380035646 + "value": 98.94555506384253 }, { "name": "smsp__cycles_active.avg.pct_of_peak_sustained_elapsed", "type": "Double", "unit": "", - "value": 99.90405419459563 + "value": 99.96648658778236 }, { "name": "smsp__sass_thread_inst_executed_op_fp16_pred_on.sum", @@ -2888,7 +3109,7 @@ "name": "smsp__sass_thread_inst_executed_op_fp32_pred_on.sum", "type": "Double", "unit": "", - "value": 5385486336.0 + "value": 4966055936.0 }, { "name": "smsp__sass_thread_inst_executed_op_fp64_pred_on.sum", @@ -2900,43 +3121,43 @@ "name": "smsp__sass_thread_inst_executed_op_integer_pred_on.sum", "type": "Double", "unit": "", - "value": 390070272.0 + "value": 5463080960.0 }, { "name": "smsp__sass_thread_inst_executed_op_control_pred_on.sum", "type": "Double", "unit": "", - "value": 71303168.0 + "value": 536870912.0 }, { "name": "smsp__sass_thread_inst_executed_op_memory_pred_on.sum", "type": "Double", "unit": "", - "value": 301989888.0 + "value": 150994944.0 }, { "name": "smsp__sass_thread_inst_executed_op_misc_pred_on.sum", "type": "Double", "unit": "", - "value": 88080384.0 + "value": 1369440256.0 }, { "name": "smsp__sass_thread_inst_executed_op_conversion_pred_on.sum", "type": "Double", "unit": "", - "value": 12582912.0 + "value": 6291456.0 }, { "name": "smsp__inst_executed.sum", "type": "Double", "unit": "", - "value": 198443008.0 + "value": 596246528.0 }, { "name": "smsp__pipe_fma_cycles_active.avg.pct_of_peak_sustained_active", "type": "Double", "unit": "", - "value": 45.895244480476805 + "value": 40.96665806808 }, { "name": "smsp__inst_executed_pipe_fp64.avg.pct_of_peak_sustained_active", @@ -2948,13 +3169,13 @@ "name": "smsp__inst_executed_pipe_xu.avg.pct_of_peak_sustained_active", "type": "Double", "unit": "", - "value": 90.119661491339 + "value": 40.6391020271737 }, { "name": "smsp__inst_executed_pipe_lsu.avg.pct_of_peak_sustained_active", "type": "Double", "unit": "", - "value": 6.600561144385179 + "value": 1.4882483652529432 }, { "name": "smsp__inst_executed_pipe_tex.avg.pct_of_peak_sustained_active", @@ -2966,7 +3187,7 @@ "name": "smsp__issue_active.avg.pct_of_peak_sustained_active", "type": "Double", "unit": "", - "value": 66.62302852685391 + "value": 90.2677601097571 }, { "name": "smsp__thread_inst_executed_per_inst_executed.ratio", @@ -2985,30 +3206,47 @@ "time" ], "times": { - "compilation": 13080.658, - "data": 62659.078, - "framework": 208103.469, - "kernel_overhead": 35861.114, - "profiling_overhead": 52838.755, - "profiling_runs": 56744.522, + "compilation_time": 13329.261, + "data": 60877.281, + "framework": 1894399.858, + "kernel_overhead": 875950.306, + "profiling_overhead": 49957.203, + "profiling_runs": 907615.068, "runtimes": [ - 1852.928 + 4067.52 ], - "search_algorithm": 23.384, - "validation": 14.469 + "search_algorithm": 24.168, + "validation": 15.35 }, - "timestamp": "2026-01-27 09:24:57 UTC" + "timestamp": "2026-03-13 09:39:36 UTC" }, { + "compilation_data": { + "constant_memory_size": 0, + "global_size": { + "x": 16, + "y": 64, + "z": 16 + }, + "local_memory_size": 0, + "local_size": { + "x": 16, + "y": 4, + "z": 1 + }, + "max_work_group_size": 1024, + "private_memory_size": 64, + "registers": 32 + }, "configuration": { - "INNER_UNROLL_FACTOR": "0", - "USE_CONSTANT_MEMORY": "0", - "USE_SOA": "0", - "VECTOR_SIZE": "1", - "WORK_GROUP_SIZE_X": "32", - "WORK_GROUP_SIZE_Y": "4", - "WORK_GROUP_SIZE_Z": "1", - "Z_ITERATIONS": "4" + "INNER_UNROLL_FACTOR": 2, + "USE_CONSTANT_MEMORY": 0, + "USE_SOA": 0, + "VECTOR_SIZE": 1, + "WORK_GROUP_SIZE_X": 16, + "WORK_GROUP_SIZE_Y": 4, + "WORK_GROUP_SIZE_Z": 1, + "Z_ITERATIONS": 16 }, "correctness": 1, "invalidity": "correct", @@ -3016,61 +3254,61 @@ { "name": "time", "unit": "", - "value": 1879.552 + "value": 8092.928 }, { "name": "dram__throughput.avg.pct_of_peak_sustained_elapsed", "type": "Double", "unit": "", - "value": 55.59903369628129 + "value": 6.47219830393929 }, { "name": "dram__sectors_read.sum", "type": "Double", "unit": "", - "value": 2098452.0 + "value": 15272.0 }, { "name": "dram__sectors_write.sum", "type": "Double", "unit": "", - "value": 1836216.0 + "value": 2019612.0 }, { "name": "lts__t_sectors.avg.pct_of_peak_sustained_elapsed", "type": "Double", "unit": "", - "value": 6.557774548162584 + "value": 56.83382157061392 }, { "name": "lts__t_sectors_op_read.sum", "type": "Double", "unit": "", - "value": 2130329.0 + "value": 24467384.0 }, { "name": "lts__t_sectors_op_write.sum", "type": "Double", "unit": "", - "value": 2098994.0 + "value": 138417132.0 }, { "name": "l1tex__t_requests_pipe_lsu_mem_global_op_ld.avg.pct_of_peak_sustained_active", "type": "Double", "unit": "", - "value": 11.96720317008775 + "value": 0.6332243162338091 }, { "name": "l1tex__t_requests_pipe_lsu_mem_global_op_ld.sum", "type": "Double", "unit": "", - "value": 8912896.0 + "value": 2097152.0 }, { "name": "l1tex__data_pipe_lsu_wavefronts_mem_shared.avg.pct_of_peak_sustained_elapsed", "type": "Double", "unit": "", - "value": 0.5275674719533896 + "value": 0.029514130826861145 }, { "name": "l1tex__data_pipe_lsu_wavefronts_mem_shared_op_ld.sum", @@ -3088,25 +3326,25 @@ "name": "l1tex__t_sectors_pipe_lsu_mem_local_op_ld.sum", "type": "Double", "unit": "", - "value": 0.0 + "value": 136314880.0 }, { "name": "l1tex__t_sectors_pipe_lsu_mem_local_op_st.sum", "type": "Double", "unit": "", - "value": 0.0 + "value": 136314880.0 }, { "name": "sm__warps_active.avg.pct_of_peak_sustained_active", "type": "Double", "unit": "", - "value": 95.8812382577246 + "value": 90.41484223138787 }, { "name": "smsp__cycles_active.avg.pct_of_peak_sustained_elapsed", "type": "Double", "unit": "", - "value": 99.90073830143925 + "value": 99.97061909768424 }, { "name": "smsp__sass_thread_inst_executed_op_fp16_pred_on.sum", @@ -3118,7 +3356,7 @@ "name": "smsp__sass_thread_inst_executed_op_fp32_pred_on.sum", "type": "Double", "unit": "", - "value": 5385486336.0 + "value": 4630511616.0 }, { "name": "smsp__sass_thread_inst_executed_op_fp64_pred_on.sum", @@ -3130,43 +3368,43 @@ "name": "smsp__sass_thread_inst_executed_op_integer_pred_on.sum", "type": "Double", "unit": "", - "value": 390070272.0 + "value": 1958739968.0 }, { "name": "smsp__sass_thread_inst_executed_op_control_pred_on.sum", "type": "Double", "unit": "", - "value": 71303168.0 + "value": 536870912.0 }, { "name": "smsp__sass_thread_inst_executed_op_memory_pred_on.sum", "type": "Double", "unit": "", - "value": 301989888.0 + "value": 1178599424.0 }, { "name": "smsp__sass_thread_inst_executed_op_misc_pred_on.sum", "type": "Double", "unit": "", - "value": 88080384.0 + "value": 143654912.0 }, { "name": "smsp__sass_thread_inst_executed_op_conversion_pred_on.sum", "type": "Double", "unit": "", - "value": 12582912.0 + "value": 3145728.0 }, { "name": "smsp__inst_executed.sum", "type": "Double", "unit": "", - "value": 198443008.0 + "value": 266371072.0 }, { "name": "smsp__pipe_fma_cycles_active.avg.pct_of_peak_sustained_active", "type": "Double", "unit": "", - "value": 45.90081261364647 + "value": 8.950291984088693 }, { "name": "smsp__inst_executed_pipe_fp64.avg.pct_of_peak_sustained_active", @@ -3178,13 +3416,13 @@ "name": "smsp__inst_executed_pipe_xu.avg.pct_of_peak_sustained_active", "type": "Double", "unit": "", - "value": 90.12764411046199 + "value": 20.154234807178394 }, { "name": "smsp__inst_executed_pipe_lsu.avg.pct_of_peak_sustained_active", "type": "Double", "unit": "", - "value": 6.601145808871728 + "value": 5.545366852463392 }, { "name": "smsp__inst_executed_pipe_tex.avg.pct_of_peak_sustained_active", @@ -3196,7 +3434,7 @@ "name": "smsp__issue_active.avg.pct_of_peak_sustained_active", "type": "Double", "unit": "", - "value": 66.62898727481817 + "value": 19.99954520813405 }, { "name": "smsp__thread_inst_executed_per_inst_executed.ratio", @@ -3215,30 +3453,47 @@ "time" ], "times": { - "compilation": 13623.498, - "data": 64967.993, - "framework": 212079.33000000002, - "kernel_overhead": 35536.83, - "profiling_overhead": 55220.838, - "profiling_runs": 56353.669, + "compilation_time": 13141.641, + "data": 65367.029, + "framework": 552439.6240000001, + "kernel_overhead": 189927.064, + "profiling_overhead": 55300.722, + "profiling_runs": 241844.809, "runtimes": [ - 1879.552 + 8092.928 ], - "search_algorithm": 18.887, - "validation": 13.817 + "search_algorithm": 28.953, + "validation": 15.538 }, - "timestamp": "2026-01-27 09:24:57 UTC" + "timestamp": "2026-03-13 09:39:37 UTC" }, { + "compilation_data": { + "constant_memory_size": 0, + "global_size": { + "x": 16, + "y": 64, + "z": 8 + }, + "local_memory_size": 0, + "local_size": { + "x": 16, + "y": 4, + "z": 1 + }, + "max_work_group_size": 1024, + "private_memory_size": 128, + "registers": 32 + }, "configuration": { - "INNER_UNROLL_FACTOR": "0", - "USE_CONSTANT_MEMORY": "0", - "USE_SOA": "0", - "VECTOR_SIZE": "1", - "WORK_GROUP_SIZE_X": "32", - "WORK_GROUP_SIZE_Y": "8", - "WORK_GROUP_SIZE_Z": "1", - "Z_ITERATIONS": "4" + "INNER_UNROLL_FACTOR": 2, + "USE_CONSTANT_MEMORY": 0, + "USE_SOA": 0, + "VECTOR_SIZE": 1, + "WORK_GROUP_SIZE_X": 16, + "WORK_GROUP_SIZE_Y": 4, + "WORK_GROUP_SIZE_Z": 1, + "Z_ITERATIONS": 32 }, "correctness": 1, "invalidity": "correct", @@ -3246,61 +3501,61 @@ { "name": "time", "unit": "", - "value": 1845.92 + "value": 9518.048 }, { "name": "dram__throughput.avg.pct_of_peak_sustained_elapsed", "type": "Double", "unit": "", - "value": 56.435641860891906 + "value": 6.280588496260721 }, { "name": "dram__sectors_read.sum", "type": "Double", "unit": "", - "value": 2098096.0 + "value": 14060.0 }, { "name": "dram__sectors_write.sum", "type": "Double", "unit": "", - "value": 1838132.0 + "value": 2269156.0 }, { "name": "lts__t_sectors.avg.pct_of_peak_sustained_elapsed", "type": "Double", "unit": "", - "value": 6.61764650653196 + "value": 81.65403883653019 }, { "name": "lts__t_sectors_op_read.sum", "type": "Double", "unit": "", - "value": 2128610.0 + "value": 132976851.0 }, { "name": "lts__t_sectors_op_write.sum", "type": "Double", "unit": "", - "value": 2099964.0 + "value": 138424680.0 }, { "name": "l1tex__t_requests_pipe_lsu_mem_global_op_ld.avg.pct_of_peak_sustained_active", "type": "Double", "unit": "", - "value": 12.074206203458965 + "value": 0.27393017661975316 }, { "name": "l1tex__t_requests_pipe_lsu_mem_global_op_ld.sum", "type": "Double", "unit": "", - "value": 8912896.0 + "value": 1048576.0 }, { "name": "l1tex__data_pipe_lsu_wavefronts_mem_shared.avg.pct_of_peak_sustained_elapsed", "type": "Double", "unit": "", - "value": 0.5323049958608608 + "value": 0.012718086514203912 }, { "name": "l1tex__data_pipe_lsu_wavefronts_mem_shared_op_ld.sum", @@ -3318,25 +3573,25 @@ "name": "l1tex__t_sectors_pipe_lsu_mem_local_op_ld.sum", "type": "Double", "unit": "", - "value": 0.0 + "value": 136314880.0 }, { "name": "l1tex__t_sectors_pipe_lsu_mem_local_op_st.sum", "type": "Double", "unit": "", - "value": 0.0 + "value": 136314880.0 }, { "name": "sm__warps_active.avg.pct_of_peak_sustained_active", "type": "Double", "unit": "", - "value": 96.45273082318771 + "value": 98.99613878402005 }, { "name": "smsp__cycles_active.avg.pct_of_peak_sustained_elapsed", "type": "Double", "unit": "", - "value": 99.91363560727518 + "value": 98.07449821000799 }, { "name": "smsp__sass_thread_inst_executed_op_fp16_pred_on.sum", @@ -3348,7 +3603,7 @@ "name": "smsp__sass_thread_inst_executed_op_fp32_pred_on.sum", "type": "Double", "unit": "", - "value": 5385486336.0 + "value": 4462739456.0 }, { "name": "smsp__sass_thread_inst_executed_op_fp64_pred_on.sum", @@ -3360,43 +3615,43 @@ "name": "smsp__sass_thread_inst_executed_op_integer_pred_on.sum", "type": "Double", "unit": "", - "value": 390070272.0 + "value": 1817706496.0 }, { "name": "smsp__sass_thread_inst_executed_op_control_pred_on.sum", "type": "Double", "unit": "", - "value": 71303168.0 + "value": 536870912.0 }, { "name": "smsp__sass_thread_inst_executed_op_memory_pred_on.sum", "type": "Double", "unit": "", - "value": 301989888.0 + "value": 1145044992.0 }, { "name": "smsp__sass_thread_inst_executed_op_misc_pred_on.sum", "type": "Double", "unit": "", - "value": 88080384.0 + "value": 72351744.0 }, { "name": "smsp__sass_thread_inst_executed_op_conversion_pred_on.sum", "type": "Double", "unit": "", - "value": 12582912.0 + "value": 1572864.0 }, { "name": "smsp__inst_executed.sum", "type": "Double", "unit": "", - "value": 198443008.0 + "value": 252264448.0 }, { "name": "smsp__pipe_fma_cycles_active.avg.pct_of_peak_sustained_active", "type": "Double", "unit": "", - "value": 46.30755804990525 + "value": 7.303431874321705 }, { "name": "smsp__inst_executed_pipe_fp64.avg.pct_of_peak_sustained_active", @@ -3408,13 +3663,13 @@ "name": "smsp__inst_executed_pipe_xu.avg.pct_of_peak_sustained_active", "type": "Double", "unit": "", - "value": 90.92524633041373 + "value": 17.705344173035794 }, { "name": "smsp__inst_executed_pipe_lsu.avg.pct_of_peak_sustained_active", "type": "Double", "unit": "", - "value": 6.659563940215849 + "value": 4.7267563118199805 }, { "name": "smsp__inst_executed_pipe_tex.avg.pct_of_peak_sustained_active", @@ -3426,7 +3681,7 @@ "name": "smsp__issue_active.avg.pct_of_peak_sustained_active", "type": "Double", "unit": "", - "value": 67.21862133958506 + "value": 16.639042150996122 }, { "name": "smsp__thread_inst_executed_per_inst_executed.ratio", @@ -3445,30 +3700,47 @@ "time" ], "times": { - "compilation": 13469.896, - "data": 65682.979, - "framework": 211507.90999999997, - "kernel_overhead": 34893.947, - "profiling_overhead": 55309.538, - "profiling_runs": 55621.446, + "compilation_time": 13326.167, + "data": 63050.243, + "framework": 525875.102, + "kernel_overhead": 175836.173, + "profiling_overhead": 52441.992, + "profiling_runs": 234546.694, "runtimes": [ - 1845.92 + 9518.048 ], - "search_algorithm": 17.68, - "validation": 13.859 + "search_algorithm": 21.905, + "validation": 13.095 }, - "timestamp": "2026-01-27 09:24:57 UTC" + "timestamp": "2026-03-13 09:39:37 UTC" }, { + "compilation_data": { + "constant_memory_size": 0, + "global_size": { + "x": 16, + "y": 64, + "z": 32 + }, + "local_memory_size": 0, + "local_size": { + "x": 16, + "y": 4, + "z": 1 + }, + "max_work_group_size": 1024, + "private_memory_size": 0, + "registers": 31 + }, "configuration": { - "INNER_UNROLL_FACTOR": "0", - "USE_CONSTANT_MEMORY": "0", - "USE_SOA": "0", - "VECTOR_SIZE": "1", - "WORK_GROUP_SIZE_X": "16", - "WORK_GROUP_SIZE_Y": "4", - "WORK_GROUP_SIZE_Z": "1", - "Z_ITERATIONS": "8" + "INNER_UNROLL_FACTOR": 4, + "USE_CONSTANT_MEMORY": 0, + "USE_SOA": 0, + "VECTOR_SIZE": 1, + "WORK_GROUP_SIZE_X": 16, + "WORK_GROUP_SIZE_Y": 4, + "WORK_GROUP_SIZE_Z": 1, + "Z_ITERATIONS": 8 }, "correctness": 1, "invalidity": "correct", @@ -3476,61 +3748,61 @@ { "name": "time", "unit": "", - "value": 1885.824 + "value": 2550.816 }, { "name": "dram__throughput.avg.pct_of_peak_sustained_elapsed", "type": "Double", "unit": "", - "value": 56.8551029104505 + "value": 19.09064788842138 }, { "name": "dram__sectors_read.sum", "type": "Double", "unit": "", - "value": 2103616.0 + "value": 8884.0 }, { "name": "dram__sectors_write.sum", "type": "Double", "unit": "", - "value": 1867108.0 + "value": 1872788.0 }, { "name": "lts__t_sectors.avg.pct_of_peak_sustained_elapsed", "type": "Double", "unit": "", - "value": 6.632503515432086 + "value": 2.4097307002662167 }, { "name": "lts__t_sectors_op_read.sum", "type": "Double", "unit": "", - "value": 2121424.0 + "value": 54866.0 }, { "name": "lts__t_sectors_op_write.sum", "type": "Double", "unit": "", - "value": 2100691.0 + "value": 2100911.0 }, { "name": "l1tex__t_requests_pipe_lsu_mem_global_op_ld.avg.pct_of_peak_sustained_active", "type": "Double", "unit": "", - "value": 6.494533659595679 + "value": 4.066936693371175 }, { "name": "l1tex__t_requests_pipe_lsu_mem_global_op_ld.sum", "type": "Double", "unit": "", - "value": 4718592.0 + "value": 4194304.0 }, { "name": "l1tex__data_pipe_lsu_wavefronts_mem_shared.avg.pct_of_peak_sustained_elapsed", "type": "Double", "unit": "", - "value": 0.2704471456634533 + "value": 0.19053323644379205 }, { "name": "l1tex__data_pipe_lsu_wavefronts_mem_shared_op_ld.sum", @@ -3560,13 +3832,13 @@ "name": "sm__warps_active.avg.pct_of_peak_sustained_active", "type": "Double", "unit": "", - "value": 97.64154138156974 + "value": 98.70618661578031 }, { "name": "smsp__cycles_active.avg.pct_of_peak_sustained_elapsed", "type": "Double", "unit": "", - "value": 99.96969595219547 + "value": 99.93617420526064 }, { "name": "smsp__sass_thread_inst_executed_op_fp16_pred_on.sum", @@ -3578,7 +3850,7 @@ "name": "smsp__sass_thread_inst_executed_op_fp32_pred_on.sum", "type": "Double", "unit": "", - "value": 4848615424.0 + "value": 4966055936.0 }, { "name": "smsp__sass_thread_inst_executed_op_fp64_pred_on.sum", @@ -3590,25 +3862,25 @@ "name": "smsp__sass_thread_inst_executed_op_integer_pred_on.sum", "type": "Double", "unit": "", - "value": 362807296.0 + "value": 2776629248.0 }, { "name": "smsp__sass_thread_inst_executed_op_control_pred_on.sum", "type": "Double", "unit": "", - "value": 69206016.0 + "value": 268435456.0 }, { "name": "smsp__sass_thread_inst_executed_op_memory_pred_on.sum", "type": "Double", "unit": "", - "value": 167772160.0 + "value": 150994944.0 }, { "name": "smsp__sass_thread_inst_executed_op_misc_pred_on.sum", "type": "Double", "unit": "", - "value": 127926272.0 + "value": 1642070016.0 }, { "name": "smsp__sass_thread_inst_executed_op_conversion_pred_on.sum", @@ -3620,13 +3892,13 @@ "name": "smsp__inst_executed.sum", "type": "Double", "unit": "", - "value": 174850048.0 + "value": 378208256.0 }, { "name": "smsp__pipe_fma_cycles_active.avg.pct_of_peak_sustained_active", "type": "Double", "unit": "", - "value": 42.60732786594795 + "value": 45.24846643398096 }, { "name": "smsp__inst_executed_pipe_fp64.avg.pct_of_peak_sustained_active", @@ -3638,13 +3910,13 @@ "name": "smsp__inst_executed_pipe_xu.avg.pct_of_peak_sustained_active", "type": "Double", "unit": "", - "value": 92.34060866198398 + "value": 65.07688054234542 }, { "name": "smsp__inst_executed_pipe_lsu.avg.pct_of_peak_sustained_active", "type": "Double", "unit": "", - "value": 3.742319589328452 + "value": 2.383186543298782 }, { "name": "smsp__inst_executed_pipe_tex.avg.pct_of_peak_sustained_active", @@ -3656,7 +3928,7 @@ "name": "smsp__issue_active.avg.pct_of_peak_sustained_active", "type": "Double", "unit": "", - "value": 60.14880698121129 + "value": 91.68994484156153 }, { "name": "smsp__thread_inst_executed_per_inst_executed.ratio", @@ -3675,30 +3947,47 @@ "time" ], "times": { - "compilation": 13056.195, - "data": 64450.364, - "framework": 198242.983, - "kernel_overhead": 29379.549, - "profiling_overhead": 54400.019, - "profiling_runs": 50013.051, + "compilation_time": 13802.381, + "data": 61211.306, + "framework": 1059974.401, + "kernel_overhead": 461636.205, + "profiling_overhead": 51025.393, + "profiling_runs": 486101.497, "runtimes": [ - 1885.824 + 2550.816 ], - "search_algorithm": 24.614, - "validation": 14.52 + "search_algorithm": 24.389, + "validation": 15.484 }, - "timestamp": "2026-01-27 09:24:58 UTC" + "timestamp": "2026-03-13 09:39:38 UTC" }, { + "compilation_data": { + "constant_memory_size": 0, + "global_size": { + "x": 16, + "y": 64, + "z": 16 + }, + "local_memory_size": 0, + "local_size": { + "x": 16, + "y": 4, + "z": 1 + }, + "max_work_group_size": 1024, + "private_memory_size": 0, + "registers": 38 + }, "configuration": { - "INNER_UNROLL_FACTOR": "0", - "USE_CONSTANT_MEMORY": "0", - "USE_SOA": "0", - "VECTOR_SIZE": "1", - "WORK_GROUP_SIZE_X": "16", - "WORK_GROUP_SIZE_Y": "8", - "WORK_GROUP_SIZE_Z": "1", - "Z_ITERATIONS": "8" + "INNER_UNROLL_FACTOR": 4, + "USE_CONSTANT_MEMORY": 0, + "USE_SOA": 0, + "VECTOR_SIZE": 1, + "WORK_GROUP_SIZE_X": 16, + "WORK_GROUP_SIZE_Y": 4, + "WORK_GROUP_SIZE_Z": 1, + "Z_ITERATIONS": 16 }, "correctness": 1, "invalidity": "correct", @@ -3706,61 +3995,61 @@ { "name": "time", "unit": "", - "value": 1794.976 + "value": 3267.616 }, { "name": "dram__throughput.avg.pct_of_peak_sustained_elapsed", "type": "Double", "unit": "", - "value": 58.10244422572178 + "value": 15.143757853861525 }, { "name": "dram__sectors_read.sum", "type": "Double", "unit": "", - "value": 2098060.0 + "value": 8180.0 }, { "name": "dram__sectors_write.sum", "type": "Double", "unit": "", - "value": 1868896.0 + "value": 1873736.0 }, { "name": "lts__t_sectors.avg.pct_of_peak_sustained_elapsed", "type": "Double", "unit": "", - "value": 6.730148509681079 + "value": 1.9125955228706149 }, { "name": "lts__t_sectors_op_read.sum", "type": "Double", "unit": "", - "value": 2115904.0 + "value": 62697.0 }, { "name": "lts__t_sectors_op_write.sum", "type": "Double", "unit": "", - "value": 2100486.0 + "value": 2103565.0 }, { "name": "l1tex__t_requests_pipe_lsu_mem_global_op_ld.avg.pct_of_peak_sustained_active", "type": "Double", "unit": "", - "value": 6.4915339926948725 + "value": 1.5934847442910576 }, { "name": "l1tex__t_requests_pipe_lsu_mem_global_op_ld.sum", "type": "Double", "unit": "", - "value": 4718592.0 + "value": 2097152.0 }, { "name": "l1tex__data_pipe_lsu_wavefronts_mem_shared.avg.pct_of_peak_sustained_elapsed", "type": "Double", "unit": "", - "value": 0.2702623643162184 + "value": 0.0746722923682766 }, { "name": "l1tex__data_pipe_lsu_wavefronts_mem_shared_op_ld.sum", @@ -3790,13 +4079,13 @@ "name": "sm__warps_active.avg.pct_of_peak_sustained_active", "type": "Double", "unit": "", - "value": 96.68083984229285 + "value": 98.69823358023744 }, { "name": "smsp__cycles_active.avg.pct_of_peak_sustained_elapsed", "type": "Double", "unit": "", - "value": 99.89533842234344 + "value": 99.96528145553604 }, { "name": "smsp__sass_thread_inst_executed_op_fp16_pred_on.sum", @@ -3808,7 +4097,7 @@ "name": "smsp__sass_thread_inst_executed_op_fp32_pred_on.sum", "type": "Double", "unit": "", - "value": 4848615424.0 + "value": 4630511616.0 }, { "name": "smsp__sass_thread_inst_executed_op_fp64_pred_on.sum", @@ -3820,43 +4109,43 @@ "name": "smsp__sass_thread_inst_executed_op_integer_pred_on.sum", "type": "Double", "unit": "", - "value": 362807296.0 + "value": 3234856960.0 }, { "name": "smsp__sass_thread_inst_executed_op_control_pred_on.sum", "type": "Double", "unit": "", - "value": 69206016.0 + "value": 268435456.0 }, { "name": "smsp__sass_thread_inst_executed_op_memory_pred_on.sum", "type": "Double", "unit": "", - "value": 167772160.0 + "value": 83886080.0 }, { "name": "smsp__sass_thread_inst_executed_op_misc_pred_on.sum", "type": "Double", "unit": "", - "value": 127926272.0 + "value": 1294991360.0 }, { "name": "smsp__sass_thread_inst_executed_op_conversion_pred_on.sum", "type": "Double", "unit": "", - "value": 6291456.0 + "value": 3145728.0 }, { "name": "smsp__inst_executed.sum", "type": "Double", "unit": "", - "value": 174850048.0 + "value": 500957184.0 }, { "name": "smsp__pipe_fma_cycles_active.avg.pct_of_peak_sustained_active", "type": "Double", "unit": "", - "value": 42.61142844083994 + "value": 45.37304425318255 }, { "name": "smsp__inst_executed_pipe_fp64.avg.pct_of_peak_sustained_active", @@ -3868,13 +4157,13 @@ "name": "smsp__inst_executed_pipe_xu.avg.pct_of_peak_sustained_active", "type": "Double", "unit": "", - "value": 92.34620468132798 + "value": 50.993989294257226 }, { "name": "smsp__inst_executed_pipe_lsu.avg.pct_of_peak_sustained_active", "type": "Double", "unit": "", - "value": 3.7425463811280384 + "value": 1.0333254666560912 }, { "name": "smsp__inst_executed_pipe_tex.avg.pct_of_peak_sustained_active", @@ -3886,7 +4175,7 @@ "name": "smsp__issue_active.avg.pct_of_peak_sustained_active", "type": "Double", "unit": "", - "value": 60.15251231803256 + "value": 95.16627270211275 }, { "name": "smsp__thread_inst_executed_per_inst_executed.ratio", @@ -3905,30 +4194,47 @@ "time" ], "times": { - "compilation": 13338.646, - "data": 64127.213, - "framework": 197177.687, - "kernel_overhead": 29192.916, - "profiling_overhead": 54139.402, - "profiling_runs": 49718.156, + "compilation_time": 13765.721, + "data": 64645.271, + "framework": 1801194.245, + "kernel_overhead": 827456.625, + "profiling_overhead": 53769.655, + "profiling_runs": 855322.694, "runtimes": [ - 1794.976 + 3267.616 ], - "search_algorithm": 21.745, - "validation": 14.304 + "search_algorithm": 23.666, + "validation": 16.678 }, - "timestamp": "2026-01-27 09:24:58 UTC" + "timestamp": "2026-03-13 09:39:39 UTC" }, { + "compilation_data": { + "constant_memory_size": 0, + "global_size": { + "x": 16, + "y": 64, + "z": 8 + }, + "local_memory_size": 0, + "local_size": { + "x": 16, + "y": 4, + "z": 1 + }, + "max_work_group_size": 1024, + "private_memory_size": 0, + "registers": 56 + }, "configuration": { - "INNER_UNROLL_FACTOR": "0", - "USE_CONSTANT_MEMORY": "0", - "USE_SOA": "0", - "VECTOR_SIZE": "1", - "WORK_GROUP_SIZE_X": "32", - "WORK_GROUP_SIZE_Y": "2", - "WORK_GROUP_SIZE_Z": "1", - "Z_ITERATIONS": "8" + "INNER_UNROLL_FACTOR": 4, + "USE_CONSTANT_MEMORY": 0, + "USE_SOA": 0, + "VECTOR_SIZE": 1, + "WORK_GROUP_SIZE_X": 16, + "WORK_GROUP_SIZE_Y": 4, + "WORK_GROUP_SIZE_Z": 1, + "Z_ITERATIONS": 32 }, "correctness": 1, "invalidity": "correct", @@ -3936,61 +4242,61 @@ { "name": "time", "unit": "", - "value": 1812.672 + "value": 5413.408 }, { "name": "dram__throughput.avg.pct_of_peak_sustained_elapsed", "type": "Double", "unit": "", - "value": 57.379347278225815 + "value": 9.05261934283943 }, { "name": "dram__sectors_read.sum", "type": "Double", "unit": "", - "value": 2097296.0 + "value": 924.0 }, { "name": "dram__sectors_write.sum", "type": "Double", "unit": "", - "value": 1837036.0 + "value": 1868116.0 }, { "name": "lts__t_sectors.avg.pct_of_peak_sustained_elapsed", "type": "Double", "unit": "", - "value": 6.719402442311702 + "value": 1.1585041074807547 }, { "name": "lts__t_sectors_op_read.sum", "type": "Double", "unit": "", - "value": 2128405.0 + "value": 91145.0 }, { "name": "lts__t_sectors_op_write.sum", "type": "Double", "unit": "", - "value": 2098991.0 + "value": 2103229.0 }, { "name": "l1tex__t_requests_pipe_lsu_mem_global_op_ld.avg.pct_of_peak_sustained_active", "type": "Double", "unit": "", - "value": 6.500687384174538 + "value": 0.47671469553645895 }, { "name": "l1tex__t_requests_pipe_lsu_mem_global_op_ld.sum", "type": "Double", "unit": "", - "value": 4718592.0 + "value": 1048576.0 }, { "name": "l1tex__data_pipe_lsu_wavefronts_mem_shared.avg.pct_of_peak_sustained_elapsed", "type": "Double", "unit": "", - "value": 0.2706739093083806 + "value": 0.022342041602234203 }, { "name": "l1tex__data_pipe_lsu_wavefronts_mem_shared_op_ld.sum", @@ -4020,13 +4326,13 @@ "name": "sm__warps_active.avg.pct_of_peak_sustained_active", "type": "Double", "unit": "", - "value": 97.86159082681552 + "value": 73.90356864956635 }, { "name": "smsp__cycles_active.avg.pct_of_peak_sustained_elapsed", "type": "Double", "unit": "", - "value": 99.91431268420496 + "value": 99.97779295974777 }, { "name": "smsp__sass_thread_inst_executed_op_fp16_pred_on.sum", @@ -4038,7 +4344,7 @@ "name": "smsp__sass_thread_inst_executed_op_fp32_pred_on.sum", "type": "Double", "unit": "", - "value": 4848615424.0 + "value": 4462739456.0 }, { "name": "smsp__sass_thread_inst_executed_op_fp64_pred_on.sum", @@ -4050,43 +4356,43 @@ "name": "smsp__sass_thread_inst_executed_op_integer_pred_on.sum", "type": "Double", "unit": "", - "value": 362807296.0 + "value": 4706533376.0 }, { "name": "smsp__sass_thread_inst_executed_op_control_pred_on.sum", "type": "Double", "unit": "", - "value": 69206016.0 + "value": 268435456.0 }, { "name": "smsp__sass_thread_inst_executed_op_memory_pred_on.sum", "type": "Double", "unit": "", - "value": 167772160.0 + "value": 50331648.0 }, { "name": "smsp__sass_thread_inst_executed_op_misc_pred_on.sum", "type": "Double", "unit": "", - "value": 127926272.0 + "value": 1121452032.0 }, { "name": "smsp__sass_thread_inst_executed_op_conversion_pred_on.sum", "type": "Double", "unit": "", - "value": 6291456.0 + "value": 1572864.0 }, { "name": "smsp__inst_executed.sum", "type": "Double", "unit": "", - "value": 174850048.0 + "value": 852869120.0 }, { "name": "smsp__pipe_fma_cycles_active.avg.pct_of_peak_sustained_active", "type": "Double", "unit": "", - "value": 42.66968122115258 + "value": 42.99004423503827 }, { "name": "smsp__inst_executed_pipe_fp64.avg.pct_of_peak_sustained_active", @@ -4098,13 +4404,13 @@ "name": "smsp__inst_executed_pipe_xu.avg.pct_of_peak_sustained_active", "type": "Double", "unit": "", - "value": 92.46926213925651 + "value": 30.51110974867372 }, { "name": "smsp__inst_executed_pipe_lsu.avg.pct_of_peak_sustained_active", "type": "Double", "unit": "", - "value": 3.747533573026509 + "value": 0.36872556947249735 }, { "name": "smsp__inst_executed_pipe_tex.avg.pct_of_peak_sustained_active", @@ -4116,7 +4422,7 @@ "name": "smsp__issue_active.avg.pct_of_peak_sustained_active", "type": "Double", "unit": "", - "value": 60.23261752459488 + "value": 96.93991939811305 }, { "name": "smsp__thread_inst_executed_per_inst_executed.ratio", @@ -4135,30 +4441,47 @@ "time" ], "times": { - "compilation": 12951.859, - "data": 64835.14, - "framework": 198413.314, - "kernel_overhead": 29155.627, - "profiling_overhead": 54793.885, - "profiling_runs": 49628.662, + "compilation_time": 13498.083, + "data": 57824.442, + "framework": 2998145.2290000003, + "kernel_overhead": 1427262.613, + "profiling_overhead": 47535.89, + "profiling_runs": 1465522.284, "runtimes": [ - 1812.672 + 5413.408 ], - "search_algorithm": 22.746, - "validation": 16.995 + "search_algorithm": 26.898, + "validation": 14.216 }, - "timestamp": "2026-01-27 09:24:58 UTC" + "timestamp": "2026-03-13 09:39:40 UTC" }, { + "compilation_data": { + "constant_memory_size": 0, + "global_size": { + "x": 16, + "y": 64, + "z": 16 + }, + "local_memory_size": 0, + "local_size": { + "x": 16, + "y": 4, + "z": 1 + }, + "max_work_group_size": 1024, + "private_memory_size": 0, + "registers": 38 + }, "configuration": { - "INNER_UNROLL_FACTOR": "0", - "USE_CONSTANT_MEMORY": "0", - "USE_SOA": "0", - "VECTOR_SIZE": "1", - "WORK_GROUP_SIZE_X": "32", - "WORK_GROUP_SIZE_Y": "4", - "WORK_GROUP_SIZE_Z": "1", - "Z_ITERATIONS": "8" + "INNER_UNROLL_FACTOR": 8, + "USE_CONSTANT_MEMORY": 0, + "USE_SOA": 0, + "VECTOR_SIZE": 1, + "WORK_GROUP_SIZE_X": 16, + "WORK_GROUP_SIZE_Y": 4, + "WORK_GROUP_SIZE_Z": 1, + "Z_ITERATIONS": 16 }, "correctness": 1, "invalidity": "correct", @@ -4166,61 +4489,61 @@ { "name": "time", "unit": "", - "value": 1874.208 + "value": 3242.944 }, { "name": "dram__throughput.avg.pct_of_peak_sustained_elapsed", "type": "Double", "unit": "", - "value": 56.40901878416553 + "value": 15.070156728064232 }, { "name": "dram__sectors_read.sum", "type": "Double", "unit": "", - "value": 2098824.0 + "value": 6516.0 }, { "name": "dram__sectors_write.sum", "type": "Double", "unit": "", - "value": 1837280.0 + "value": 1872272.0 }, { "name": "lts__t_sectors.avg.pct_of_peak_sustained_elapsed", "type": "Double", "unit": "", - "value": 6.671308175583225 + "value": 1.9070522582035985 }, { "name": "lts__t_sectors_op_read.sum", "type": "Double", "unit": "", - "value": 2130027.0 + "value": 65532.0 }, { "name": "lts__t_sectors_op_write.sum", "type": "Double", "unit": "", - "value": 2099518.0 + "value": 2107809.0 }, { "name": "l1tex__t_requests_pipe_lsu_mem_global_op_ld.avg.pct_of_peak_sustained_active", "type": "Double", "unit": "", - "value": 6.500956518190572 + "value": 1.5874798115324933 }, { "name": "l1tex__t_requests_pipe_lsu_mem_global_op_ld.sum", "type": "Double", "unit": "", - "value": 4718592.0 + "value": 2097152.0 }, { "name": "l1tex__data_pipe_lsu_wavefronts_mem_shared.avg.pct_of_peak_sustained_elapsed", "type": "Double", "unit": "", - "value": 0.2707194985344289 + "value": 0.07439211777734903 }, { "name": "l1tex__data_pipe_lsu_wavefronts_mem_shared_op_ld.sum", @@ -4250,13 +4573,13 @@ "name": "sm__warps_active.avg.pct_of_peak_sustained_active", "type": "Double", "unit": "", - "value": 97.70923893972369 + "value": 98.74010098233705 }, { "name": "smsp__cycles_active.avg.pct_of_peak_sustained_elapsed", "type": "Double", "unit": "", - "value": 99.9178648982362 + "value": 99.96617415286794 }, { "name": "smsp__sass_thread_inst_executed_op_fp16_pred_on.sum", @@ -4268,7 +4591,7 @@ "name": "smsp__sass_thread_inst_executed_op_fp32_pred_on.sum", "type": "Double", "unit": "", - "value": 4848615424.0 + "value": 4630511616.0 }, { "name": "smsp__sass_thread_inst_executed_op_fp64_pred_on.sum", @@ -4280,43 +4603,43 @@ "name": "smsp__sass_thread_inst_executed_op_integer_pred_on.sum", "type": "Double", "unit": "", - "value": 362807296.0 + "value": 2295332864.0 }, { "name": "smsp__sass_thread_inst_executed_op_control_pred_on.sum", "type": "Double", "unit": "", - "value": 69206016.0 + "value": 134217728.0 }, { "name": "smsp__sass_thread_inst_executed_op_memory_pred_on.sum", "type": "Double", "unit": "", - "value": 167772160.0 + "value": 83886080.0 }, { "name": "smsp__sass_thread_inst_executed_op_misc_pred_on.sum", "type": "Double", "unit": "", - "value": 127926272.0 + "value": 1429209088.0 }, { "name": "smsp__sass_thread_inst_executed_op_conversion_pred_on.sum", "type": "Double", "unit": "", - "value": 6291456.0 + "value": 3145728.0 }, { "name": "smsp__inst_executed.sum", "type": "Double", "unit": "", - "value": 174850048.0 + "value": 471597056.0 }, { "name": "smsp__pipe_fma_cycles_active.avg.pct_of_peak_sustained_active", "type": "Double", "unit": "", - "value": 42.67341114464659 + "value": 42.02712046325702 }, { "name": "smsp__inst_executed_pipe_fp64.avg.pct_of_peak_sustained_active", @@ -4328,13 +4651,13 @@ "name": "smsp__inst_executed_pipe_xu.avg.pct_of_peak_sustained_active", "type": "Double", "unit": "", - "value": 92.48154864717917 + "value": 50.80220334498012 }, { "name": "smsp__inst_executed_pipe_lsu.avg.pct_of_peak_sustained_active", "type": "Double", "unit": "", - "value": 3.7480315125565773 + "value": 1.0294391791097046 }, { "name": "smsp__inst_executed_pipe_tex.avg.pct_of_peak_sustained_active", @@ -4346,7 +4669,7 @@ "name": "smsp__issue_active.avg.pct_of_peak_sustained_active", "type": "Double", "unit": "", - "value": 60.24067273135035 + "value": 89.25186800331588 }, { "name": "smsp__thread_inst_executed_per_inst_executed.ratio", @@ -4365,30 +4688,47 @@ "time" ], "times": { - "compilation": 13268.024, - "data": 63147.229, - "framework": 195912.894, - "kernel_overhead": 29349.356, - "profiling_overhead": 53206.932, - "profiling_runs": 50209.377, + "compilation_time": 13437.305, + "data": 57434.97, + "framework": 1620592.3730000001, + "kernel_overhead": 744021.449, + "profiling_overhead": 47456.893, + "profiling_runs": 771679.061, "runtimes": [ - 1874.208 + 3242.944 ], - "search_algorithm": 24.411, - "validation": 14.448 + "search_algorithm": 24.538, + "validation": 13.322 }, - "timestamp": "2026-01-27 09:24:58 UTC" + "timestamp": "2026-03-13 09:39:41 UTC" }, { + "compilation_data": { + "constant_memory_size": 0, + "global_size": { + "x": 16, + "y": 64, + "z": 8 + }, + "local_memory_size": 0, + "local_size": { + "x": 16, + "y": 4, + "z": 1 + }, + "max_work_group_size": 1024, + "private_memory_size": 0, + "registers": 48 + }, "configuration": { - "INNER_UNROLL_FACTOR": "0", - "USE_CONSTANT_MEMORY": "0", - "USE_SOA": "0", - "VECTOR_SIZE": "1", - "WORK_GROUP_SIZE_X": "32", - "WORK_GROUP_SIZE_Y": "8", - "WORK_GROUP_SIZE_Z": "1", - "Z_ITERATIONS": "8" + "INNER_UNROLL_FACTOR": 8, + "USE_CONSTANT_MEMORY": 0, + "USE_SOA": 0, + "VECTOR_SIZE": 1, + "WORK_GROUP_SIZE_X": 16, + "WORK_GROUP_SIZE_Y": 4, + "WORK_GROUP_SIZE_Z": 1, + "Z_ITERATIONS": 32 }, "correctness": 1, "invalidity": "correct", @@ -4396,61 +4736,61 @@ { "name": "time", "unit": "", - "value": 1860.032 + "value": 6152.32 }, { "name": "dram__throughput.avg.pct_of_peak_sustained_elapsed", "type": "Double", "unit": "", - "value": 57.329910215854106 + "value": 7.960563804610989 }, { "name": "dram__sectors_read.sum", "type": "Double", "unit": "", - "value": 2102560.0 + "value": 808.0 }, { "name": "dram__sectors_write.sum", "type": "Double", "unit": "", - "value": 1841004.0 + "value": 1867828.0 }, { "name": "lts__t_sectors.avg.pct_of_peak_sustained_elapsed", "type": "Double", "unit": "", - "value": 6.774905878019141 + "value": 1.021891986368899 }, { "name": "lts__t_sectors_op_read.sum", "type": "Double", "unit": "", - "value": 2133760.0 + "value": 101230.0 }, { "name": "lts__t_sectors_op_write.sum", "type": "Double", "unit": "", - "value": 2100403.0 + "value": 2099637.0 }, { "name": "l1tex__t_requests_pipe_lsu_mem_global_op_ld.avg.pct_of_peak_sustained_active", "type": "Double", "unit": "", - "value": 6.594743782113634 + "value": 0.4193237777043955 }, { "name": "l1tex__t_requests_pipe_lsu_mem_global_op_ld.sum", "type": "Double", "unit": "", - "value": 4718592.0 + "value": 1048576.0 }, { "name": "l1tex__data_pipe_lsu_wavefronts_mem_shared.avg.pct_of_peak_sustained_elapsed", "type": "Double", "unit": "", - "value": 0.2746429953966901 + "value": 0.01965114798494653 }, { "name": "l1tex__data_pipe_lsu_wavefronts_mem_shared_op_ld.sum", @@ -4480,13 +4820,13 @@ "name": "sm__warps_active.avg.pct_of_peak_sustained_active", "type": "Double", "unit": "", - "value": 97.58485780846348 + "value": 82.00754621887451 }, { "name": "smsp__cycles_active.avg.pct_of_peak_sustained_elapsed", "type": "Double", "unit": "", - "value": 99.93248890520972 + "value": 99.9707325752225 }, { "name": "smsp__sass_thread_inst_executed_op_fp16_pred_on.sum", @@ -4498,7 +4838,7 @@ "name": "smsp__sass_thread_inst_executed_op_fp32_pred_on.sum", "type": "Double", "unit": "", - "value": 4848615424.0 + "value": 4462739456.0 }, { "name": "smsp__sass_thread_inst_executed_op_fp64_pred_on.sum", @@ -4510,43 +4850,43 @@ "name": "smsp__sass_thread_inst_executed_op_integer_pred_on.sum", "type": "Double", "unit": "", - "value": 362807296.0 + "value": 6215958528.0 }, { "name": "smsp__sass_thread_inst_executed_op_control_pred_on.sum", "type": "Double", "unit": "", - "value": 69206016.0 + "value": 134217728.0 }, { "name": "smsp__sass_thread_inst_executed_op_memory_pred_on.sum", "type": "Double", "unit": "", - "value": 167772160.0 + "value": 50331648.0 }, { "name": "smsp__sass_thread_inst_executed_op_misc_pred_on.sum", "type": "Double", "unit": "", - "value": 127926272.0 + "value": 1355284480.0 }, { "name": "smsp__sass_thread_inst_executed_op_conversion_pred_on.sum", "type": "Double", "unit": "", - "value": 6291456.0 + "value": 1572864.0 }, { "name": "smsp__inst_executed.sum", "type": "Double", "unit": "", - "value": 174850048.0 + "value": 936706048.0 }, { "name": "smsp__pipe_fma_cycles_active.avg.pct_of_peak_sustained_active", "type": "Double", "unit": "", - "value": 43.28453605973394 + "value": 42.84567406055009 }, { "name": "smsp__inst_executed_pipe_fp64.avg.pct_of_peak_sustained_active", @@ -4558,13 +4898,13 @@ "name": "smsp__inst_executed_pipe_xu.avg.pct_of_peak_sustained_active", "type": "Double", "unit": "", - "value": 93.80813999771843 + "value": 26.838222238618968 }, { "name": "smsp__inst_executed_pipe_lsu.avg.pct_of_peak_sustained_active", "type": "Double", "unit": "", - "value": 3.8017947362356592 + "value": 0.3243388673856541 }, { "name": "smsp__inst_executed_pipe_tex.avg.pct_of_peak_sustained_active", @@ -4576,7 +4916,7 @@ "name": "smsp__issue_active.avg.pct_of_peak_sustained_active", "type": "Double", "unit": "", - "value": 61.10478840161182 + "value": 93.65250742515222 }, { "name": "smsp__thread_inst_executed_per_inst_executed.ratio", @@ -4595,30 +4935,47 @@ "time" ], "times": { - "compilation": 13003.798, - "data": 62419.829, - "framework": 194328.153, - "kernel_overhead": 29340.34, - "profiling_overhead": 52824.851, - "profiling_runs": 49743.133, + "compilation_time": 13696.832, + "data": 57523.789, + "framework": 3474573.912, + "kernel_overhead": 1663818.882, + "profiling_overhead": 47568.036, + "profiling_runs": 1705663.205, "runtimes": [ - 1860.032 + 6152.32 ], - "search_algorithm": 19.86, - "validation": 13.132 + "search_algorithm": 26.028, + "validation": 15.599 }, - "timestamp": "2026-01-27 09:24:58 UTC" + "timestamp": "2026-03-13 09:39:43 UTC" }, { + "compilation_data": { + "constant_memory_size": 0, + "global_size": { + "x": 16, + "y": 64, + "z": 8 + }, + "local_memory_size": 0, + "local_size": { + "x": 16, + "y": 4, + "z": 1 + }, + "max_work_group_size": 1024, + "private_memory_size": 0, + "registers": 48 + }, "configuration": { - "INNER_UNROLL_FACTOR": "0", - "USE_CONSTANT_MEMORY": "0", - "USE_SOA": "0", - "VECTOR_SIZE": "1", - "WORK_GROUP_SIZE_X": "16", - "WORK_GROUP_SIZE_Y": "4", - "WORK_GROUP_SIZE_Z": "1", - "Z_ITERATIONS": "16" + "INNER_UNROLL_FACTOR": 16, + "USE_CONSTANT_MEMORY": 0, + "USE_SOA": 0, + "VECTOR_SIZE": 1, + "WORK_GROUP_SIZE_X": 16, + "WORK_GROUP_SIZE_Y": 4, + "WORK_GROUP_SIZE_Z": 1, + "Z_ITERATIONS": 32 }, "correctness": 1, "invalidity": "correct", @@ -4626,61 +4983,61 @@ { "name": "time", "unit": "", - "value": 1731.04 + "value": 5820.832 }, { "name": "dram__throughput.avg.pct_of_peak_sustained_elapsed", "type": "Double", "unit": "", - "value": 60.035143824191174 + "value": 8.36579919043196 }, { "name": "dram__sectors_read.sum", "type": "Double", "unit": "", - "value": 2098040.0 + "value": 508.0 }, { "name": "dram__sectors_write.sum", "type": "Double", "unit": "", - "value": 1863780.0 + "value": 1867156.0 }, { "name": "lts__t_sectors.avg.pct_of_peak_sustained_elapsed", "type": "Double", "unit": "", - "value": 6.963992687996251 + "value": 1.0720213682102864 }, { "name": "lts__t_sectors_op_read.sum", "type": "Double", "unit": "", - "value": 2117071.0 + "value": 96518.0 }, { "name": "lts__t_sectors_op_write.sum", "type": "Double", "unit": "", - "value": 2099909.0 + "value": 2099287.0 }, { "name": "l1tex__t_requests_pipe_lsu_mem_global_op_ld.avg.pct_of_peak_sustained_active", "type": "Double", "unit": "", - "value": 3.7311426382365545 + "value": 0.4408757108742221 }, { "name": "l1tex__t_requests_pipe_lsu_mem_global_op_ld.sum", "type": "Double", "unit": "", - "value": 2621440.0 + "value": 1048576.0 }, { "name": "l1tex__data_pipe_lsu_wavefronts_mem_shared.avg.pct_of_peak_sustained_elapsed", "type": "Double", "unit": "", - "value": 0.13980918685620702 + "value": 0.02066238619418991 }, { "name": "l1tex__data_pipe_lsu_wavefronts_mem_shared_op_ld.sum", @@ -4710,13 +5067,13 @@ "name": "sm__warps_active.avg.pct_of_peak_sustained_active", "type": "Double", "unit": "", - "value": 98.00429718077244 + "value": 82.01736147160477 }, { "name": "smsp__cycles_active.avg.pct_of_peak_sustained_elapsed", "type": "Double", "unit": "", - "value": 99.91210903944672 + "value": 99.97510000019841 }, { "name": "smsp__sass_thread_inst_executed_op_fp16_pred_on.sum", @@ -4728,7 +5085,7 @@ "name": "smsp__sass_thread_inst_executed_op_fp32_pred_on.sum", "type": "Double", "unit": "", - "value": 4580179968.0 + "value": 4462739456.0 }, { "name": "smsp__sass_thread_inst_executed_op_fp64_pred_on.sum", @@ -4740,43 +5097,43 @@ "name": "smsp__sass_thread_inst_executed_op_integer_pred_on.sum", "type": "Double", "unit": "", - "value": 209715200.0 + "value": 5813305344.0 }, { "name": "smsp__sass_thread_inst_executed_op_control_pred_on.sum", "type": "Double", "unit": "", - "value": 67108864.0 + "value": 101187584.0 }, { "name": "smsp__sass_thread_inst_executed_op_memory_pred_on.sum", "type": "Double", "unit": "", - "value": 100663296.0 + "value": 50331648.0 }, { "name": "smsp__sass_thread_inst_executed_op_misc_pred_on.sum", "type": "Double", "unit": "", - "value": 162529280.0 + "value": 986185728.0 }, { "name": "smsp__sass_thread_inst_executed_op_conversion_pred_on.sum", "type": "Double", "unit": "", - "value": 3145728.0 + "value": 1572864.0 }, { "name": "smsp__inst_executed.sum", "type": "Double", "unit": "", - "value": 164495360.0 + "value": 897892352.0 }, { "name": "smsp__pipe_fma_cycles_active.avg.pct_of_peak_sustained_active", "type": "Double", "unit": "", - "value": 39.57796486606208 + "value": 43.94594429436074 }, { "name": "smsp__inst_executed_pipe_fp64.avg.pct_of_peak_sustained_active", @@ -4788,13 +5145,13 @@ "name": "smsp__inst_executed_pipe_xu.avg.pct_of_peak_sustained_active", "type": "Double", "unit": "", - "value": 95.52703118580123 + "value": 28.21807091673623 }, { "name": "smsp__inst_executed_pipe_lsu.avg.pct_of_peak_sustained_active", "type": "Double", "unit": "", - "value": 2.3088808807115044 + "value": 0.3410142847603622 }, { "name": "smsp__inst_executed_pipe_tex.avg.pct_of_peak_sustained_active", @@ -4806,7 +5163,7 @@ "name": "smsp__issue_active.avg.pct_of_peak_sustained_active", "type": "Double", "unit": "", - "value": 58.5395641892962 + "value": 94.49934478493837 }, { "name": "smsp__thread_inst_executed_per_inst_executed.ratio", @@ -4825,30 +5182,47 @@ "time" ], "times": { - "compilation": 13477.646, - "data": 65343.543, - "framework": 204996.081, - "kernel_overhead": 32143.586, - "profiling_overhead": 55115.64, - "profiling_runs": 52393.312, + "compilation_time": 13021.254, + "data": 57564.108, + "framework": 3381161.4390000002, + "kernel_overhead": 1617909.261, + "profiling_overhead": 47570.282, + "profiling_runs": 1658117.788, "runtimes": [ - 1731.04 + 5820.832 ], - "search_algorithm": 23.539, - "validation": 15.464 + "search_algorithm": 23.105, + "validation": 16.408 }, - "timestamp": "2026-01-27 09:24:58 UTC" + "timestamp": "2026-03-13 09:39:45 UTC" }, { + "compilation_data": { + "constant_memory_size": 0, + "global_size": { + "x": 16, + "y": 32, + "z": 256 + }, + "local_memory_size": 0, + "local_size": { + "x": 16, + "y": 8, + "z": 1 + }, + "max_work_group_size": 1024, + "private_memory_size": 0, + "registers": 44 + }, "configuration": { - "INNER_UNROLL_FACTOR": "0", - "USE_CONSTANT_MEMORY": "0", - "USE_SOA": "0", - "VECTOR_SIZE": "1", - "WORK_GROUP_SIZE_X": "16", - "WORK_GROUP_SIZE_Y": "8", - "WORK_GROUP_SIZE_Z": "1", - "Z_ITERATIONS": "16" + "INNER_UNROLL_FACTOR": 0, + "USE_CONSTANT_MEMORY": 0, + "USE_SOA": 0, + "VECTOR_SIZE": 1, + "WORK_GROUP_SIZE_X": 16, + "WORK_GROUP_SIZE_Y": 8, + "WORK_GROUP_SIZE_Z": 1, + "Z_ITERATIONS": 1 }, "correctness": 1, "invalidity": "correct", @@ -4856,61 +5230,61 @@ { "name": "time", "unit": "", - "value": 1793.632 + "value": 3718.144 }, { "name": "dram__throughput.avg.pct_of_peak_sustained_elapsed", "type": "Double", "unit": "", - "value": 59.07460169233115 + "value": 13.274145030646006 }, { "name": "dram__sectors_read.sum", "type": "Double", "unit": "", - "value": 2100992.0 + "value": 7856.0 }, { "name": "dram__sectors_write.sum", "type": "Double", "unit": "", - "value": 1866704.0 + "value": 1871644.0 }, { "name": "lts__t_sectors.avg.pct_of_peak_sustained_elapsed", "type": "Double", "unit": "", - "value": 6.90357078888712 + "value": 1.6937906639914093 }, { "name": "lts__t_sectors_op_read.sum", "type": "Double", "unit": "", - "value": 2118190.0 + "value": 70002.0 }, { "name": "lts__t_sectors_op_write.sum", "type": "Double", "unit": "", - "value": 2099720.0 + "value": 2100398.0 }, { "name": "l1tex__t_requests_pipe_lsu_mem_global_op_ld.avg.pct_of_peak_sustained_active", "type": "Double", "unit": "", - "value": 3.7306526933032873 + "value": 22.50579957544309 }, { "name": "l1tex__t_requests_pipe_lsu_mem_global_op_ld.sum", "type": "Double", "unit": "", - "value": 2621440.0 + "value": 33554432.0 }, { "name": "l1tex__data_pipe_lsu_wavefronts_mem_shared.avg.pct_of_peak_sustained_elapsed", "type": "Double", "unit": "", - "value": 0.13981279785456535 + "value": 1.054723321279792 }, { "name": "l1tex__data_pipe_lsu_wavefronts_mem_shared_op_ld.sum", @@ -4940,13 +5314,13 @@ "name": "sm__warps_active.avg.pct_of_peak_sustained_active", "type": "Double", "unit": "", - "value": 97.54815805545952 + "value": 70.29932076541618 }, { "name": "smsp__cycles_active.avg.pct_of_peak_sustained_elapsed", "type": "Double", "unit": "", - "value": 99.95209544260113 + "value": 99.96244008680353 }, { "name": "smsp__sass_thread_inst_executed_op_fp16_pred_on.sum", @@ -4958,7 +5332,7 @@ "name": "smsp__sass_thread_inst_executed_op_fp32_pred_on.sum", "type": "Double", "unit": "", - "value": 4580179968.0 + "value": 8589934592.0 }, { "name": "smsp__sass_thread_inst_executed_op_fp64_pred_on.sum", @@ -4970,43 +5344,43 @@ "name": "smsp__sass_thread_inst_executed_op_integer_pred_on.sum", "type": "Double", "unit": "", - "value": 209715200.0 + "value": 587202560.0 }, { "name": "smsp__sass_thread_inst_executed_op_control_pred_on.sum", "type": "Double", "unit": "", - "value": 67108864.0 + "value": 117440512.0 }, { "name": "smsp__sass_thread_inst_executed_op_memory_pred_on.sum", "type": "Double", "unit": "", - "value": 100663296.0 + "value": 1090519040.0 }, { "name": "smsp__sass_thread_inst_executed_op_misc_pred_on.sum", "type": "Double", "unit": "", - "value": 162529280.0 + "value": 234881024.0 }, { "name": "smsp__sass_thread_inst_executed_op_conversion_pred_on.sum", "type": "Double", "unit": "", - "value": 3145728.0 + "value": 50331648.0 }, { "name": "smsp__inst_executed.sum", "type": "Double", "unit": "", - "value": 164495360.0 + "value": 341311488.0 }, { "name": "smsp__pipe_fma_cycles_active.avg.pct_of_peak_sustained_active", "type": "Double", "unit": "", - "value": 39.56175134643332 + "value": 40.00648153038042 }, { "name": "smsp__inst_executed_pipe_fp64.avg.pct_of_peak_sustained_active", @@ -5018,13 +5392,13 @@ "name": "smsp__inst_executed_pipe_xu.avg.pct_of_peak_sustained_active", "type": "Double", "unit": "", - "value": 95.49128134439913 + "value": 45.01843726056193 }, { "name": "smsp__inst_executed_pipe_lsu.avg.pct_of_peak_sustained_active", "type": "Double", "unit": "", - "value": 2.3080168098377722 + "value": 11.958022397336764 }, { "name": "smsp__inst_executed_pipe_tex.avg.pct_of_peak_sustained_active", @@ -5036,7 +5410,7 @@ "name": "smsp__issue_active.avg.pct_of_peak_sustained_active", "type": "Double", "unit": "", - "value": 58.51772191696223 + "value": 57.240932863416795 }, { "name": "smsp__thread_inst_executed_per_inst_executed.ratio", @@ -5055,30 +5429,47 @@ "time" ], "times": { - "compilation": 13204.42, - "data": 61083.556, - "framework": 197219.311, - "kernel_overhead": 32146.712, - "profiling_overhead": 51453.804, - "profiling_runs": 52535.239, + "compilation_time": 13457.403, + "data": 58061.018, + "framework": 237839.14899999998, + "kernel_overhead": 50812.939, + "profiling_overhead": 48083.597, + "profiling_runs": 80881.595, "runtimes": [ - 1793.632 + 3718.144 ], - "search_algorithm": 25.177, - "validation": 13.911 + "search_algorithm": 37.562, + "validation": 19.367 }, - "timestamp": "2026-01-27 09:24:58 UTC" + "timestamp": "2026-03-13 09:39:45 UTC" }, { + "compilation_data": { + "constant_memory_size": 0, + "global_size": { + "x": 16, + "y": 32, + "z": 128 + }, + "local_memory_size": 0, + "local_size": { + "x": 16, + "y": 8, + "z": 1 + }, + "max_work_group_size": 1024, + "private_memory_size": 0, + "registers": 31 + }, "configuration": { - "INNER_UNROLL_FACTOR": "0", - "USE_CONSTANT_MEMORY": "0", - "USE_SOA": "0", - "VECTOR_SIZE": "1", - "WORK_GROUP_SIZE_X": "32", - "WORK_GROUP_SIZE_Y": "2", - "WORK_GROUP_SIZE_Z": "1", - "Z_ITERATIONS": "16" + "INNER_UNROLL_FACTOR": 0, + "USE_CONSTANT_MEMORY": 0, + "USE_SOA": 0, + "VECTOR_SIZE": 1, + "WORK_GROUP_SIZE_X": 16, + "WORK_GROUP_SIZE_Y": 8, + "WORK_GROUP_SIZE_Z": 1, + "Z_ITERATIONS": 2 }, "correctness": 1, "invalidity": "correct", @@ -5086,61 +5477,61 @@ { "name": "time", "unit": "", - "value": 1751.904 + "value": 2139.776 }, { "name": "dram__throughput.avg.pct_of_peak_sustained_elapsed", "type": "Double", "unit": "", - "value": 58.412663206437166 + "value": 23.263861287758345 }, { "name": "dram__sectors_read.sum", "type": "Double", "unit": "", - "value": 2100768.0 + "value": 2652.0 }, { "name": "dram__sectors_write.sum", "type": "Double", "unit": "", - "value": 1839068.0 + "value": 1870368.0 }, { "name": "lts__t_sectors.avg.pct_of_peak_sustained_elapsed", "type": "Double", "unit": "", - "value": 6.901710433264528 + "value": 2.9347261086783023 }, { "name": "lts__t_sectors_op_read.sum", "type": "Double", "unit": "", - "value": 2131555.0 + "value": 38937.0 }, { "name": "lts__t_sectors_op_write.sum", "type": "Double", "unit": "", - "value": 2099819.0 + "value": 2099440.0 }, { "name": "l1tex__t_requests_pipe_lsu_mem_global_op_ld.avg.pct_of_peak_sustained_active", "type": "Double", "unit": "", - "value": 3.736208333179525 + "value": 19.91093989344764 }, { "name": "l1tex__t_requests_pipe_lsu_mem_global_op_ld.sum", "type": "Double", "unit": "", - "value": 2621440.0 + "value": 16777216.0 }, { "name": "l1tex__data_pipe_lsu_wavefronts_mem_shared.avg.pct_of_peak_sustained_elapsed", "type": "Double", "unit": "", - "value": 0.14003613497532386 + "value": 0.9329016663906669 }, { "name": "l1tex__data_pipe_lsu_wavefronts_mem_shared_op_ld.sum", @@ -5170,13 +5561,13 @@ "name": "sm__warps_active.avg.pct_of_peak_sustained_active", "type": "Double", "unit": "", - "value": 98.23922989484699 + "value": 94.7530921155861 }, { "name": "smsp__cycles_active.avg.pct_of_peak_sustained_elapsed", "type": "Double", "unit": "", - "value": 99.93148693919164 + "value": 99.93916456977948 }, { "name": "smsp__sass_thread_inst_executed_op_fp16_pred_on.sum", @@ -5188,7 +5579,7 @@ "name": "smsp__sass_thread_inst_executed_op_fp32_pred_on.sum", "type": "Double", "unit": "", - "value": 4580179968.0 + "value": 6442450944.0 }, { "name": "smsp__sass_thread_inst_executed_op_fp64_pred_on.sum", @@ -5200,43 +5591,43 @@ "name": "smsp__sass_thread_inst_executed_op_integer_pred_on.sum", "type": "Double", "unit": "", - "value": 209715200.0 + "value": 704643072.0 }, { "name": "smsp__sass_thread_inst_executed_op_control_pred_on.sum", "type": "Double", "unit": "", - "value": 67108864.0 + "value": 142606336.0 }, { "name": "smsp__sass_thread_inst_executed_op_memory_pred_on.sum", "type": "Double", "unit": "", - "value": 100663296.0 + "value": 553648128.0 }, { "name": "smsp__sass_thread_inst_executed_op_misc_pred_on.sum", "type": "Double", "unit": "", - "value": 162529280.0 + "value": 134217728.0 }, { "name": "smsp__sass_thread_inst_executed_op_conversion_pred_on.sum", "type": "Double", "unit": "", - "value": 3145728.0 + "value": 25165824.0 }, { "name": "smsp__inst_executed.sum", "type": "Double", "unit": "", - "value": 164495360.0 + "value": 256901120.0 }, { "name": "smsp__pipe_fma_cycles_active.avg.pct_of_peak_sustained_active", "type": "Double", "unit": "", - "value": 39.63581394364581 + "value": 50.485067492657066 }, { "name": "smsp__inst_executed_pipe_fp64.avg.pct_of_peak_sustained_active", @@ -5248,13 +5639,13 @@ "name": "smsp__inst_executed_pipe_xu.avg.pct_of_peak_sustained_active", "type": "Double", "unit": "", - "value": 95.66354349821613 + "value": 79.65606797698744 }, { "name": "smsp__inst_executed_pipe_lsu.avg.pct_of_peak_sustained_active", "type": "Double", "unit": "", - "value": 2.312180372637548 + "value": 10.734899785961199 }, { "name": "smsp__inst_executed_pipe_tex.avg.pct_of_peak_sustained_active", @@ -5266,7 +5657,7 @@ "name": "smsp__issue_active.avg.pct_of_peak_sustained_active", "type": "Double", "unit": "", - "value": 58.62322121652317 + "value": 76.23439380528096 }, { "name": "smsp__thread_inst_executed_per_inst_executed.ratio", @@ -5285,30 +5676,47 @@ "time" ], "times": { - "compilation": 12953.606, - "data": 62892.414, - "framework": 199979.89399999997, - "kernel_overhead": 32148.156, - "profiling_overhead": 52283.25, - "profiling_runs": 52656.074, + "compilation_time": 13768.848, + "data": 57583.361, + "framework": 247816.405, + "kernel_overhead": 60061.414, + "profiling_overhead": 47990.684, + "profiling_runs": 82180.946, "runtimes": [ - 1751.904 + 2139.776 ], - "search_algorithm": 26.307, - "validation": 11.842 + "search_algorithm": 29.641, + "validation": 15.903 }, - "timestamp": "2026-01-27 09:24:59 UTC" + "timestamp": "2026-03-13 09:39:45 UTC" }, { + "compilation_data": { + "constant_memory_size": 0, + "global_size": { + "x": 16, + "y": 32, + "z": 64 + }, + "local_memory_size": 0, + "local_size": { + "x": 16, + "y": 8, + "z": 1 + }, + "max_work_group_size": 1024, + "private_memory_size": 0, + "registers": 39 + }, "configuration": { - "INNER_UNROLL_FACTOR": "0", - "USE_CONSTANT_MEMORY": "0", - "USE_SOA": "0", - "VECTOR_SIZE": "1", - "WORK_GROUP_SIZE_X": "32", - "WORK_GROUP_SIZE_Y": "4", - "WORK_GROUP_SIZE_Z": "1", - "Z_ITERATIONS": "16" + "INNER_UNROLL_FACTOR": 0, + "USE_CONSTANT_MEMORY": 0, + "USE_SOA": 0, + "VECTOR_SIZE": 1, + "WORK_GROUP_SIZE_X": 16, + "WORK_GROUP_SIZE_Y": 8, + "WORK_GROUP_SIZE_Z": 1, + "Z_ITERATIONS": 4 }, "correctness": 1, "invalidity": "correct", @@ -5316,61 +5724,61 @@ { "name": "time", "unit": "", - "value": 1741.92 + "value": 1841.952 }, { "name": "dram__throughput.avg.pct_of_peak_sustained_elapsed", "type": "Double", "unit": "", - "value": 58.78675066819397 + "value": 26.397338488715842 }, { "name": "dram__sectors_read.sum", "type": "Double", "unit": "", - "value": 2101376.0 + "value": 3056.0 }, { "name": "dram__sectors_write.sum", "type": "Double", "unit": "", - "value": 1840064.0 + "value": 1871400.0 }, { "name": "lts__t_sectors.avg.pct_of_peak_sustained_elapsed", "type": "Double", "unit": "", - "value": 6.939718160306523 + "value": 3.3236052548179416 }, { "name": "lts__t_sectors_op_read.sum", "type": "Double", "unit": "", - "value": 2132248.0 + "value": 36078.0 }, { "name": "lts__t_sectors_op_write.sum", "type": "Double", "unit": "", - "value": 2100488.0 + "value": 2099760.0 }, { "name": "l1tex__t_requests_pipe_lsu_mem_global_op_ld.avg.pct_of_peak_sustained_active", "type": "Double", "unit": "", - "value": 3.737182801933645 + "value": 11.290473997700856 }, { "name": "l1tex__t_requests_pipe_lsu_mem_global_op_ld.sum", "type": "Double", "unit": "", - "value": 2621440.0 + "value": 8388608.0 }, { "name": "l1tex__data_pipe_lsu_wavefronts_mem_shared.avg.pct_of_peak_sustained_elapsed", "type": "Double", "unit": "", - "value": 0.14005667690686743 + "value": 0.5288694479440943 }, { "name": "l1tex__data_pipe_lsu_wavefronts_mem_shared_op_ld.sum", @@ -5400,13 +5808,13 @@ "name": "sm__warps_active.avg.pct_of_peak_sustained_active", "type": "Double", "unit": "", - "value": 98.18895211253626 + "value": 95.82812139003512 }, { "name": "smsp__cycles_active.avg.pct_of_peak_sustained_elapsed", "type": "Double", "unit": "", - "value": 99.95041333417525 + "value": 99.89685347049594 }, { "name": "smsp__sass_thread_inst_executed_op_fp16_pred_on.sum", @@ -5418,7 +5826,7 @@ "name": "smsp__sass_thread_inst_executed_op_fp32_pred_on.sum", "type": "Double", "unit": "", - "value": 4580179968.0 + "value": 5368709120.0 }, { "name": "smsp__sass_thread_inst_executed_op_fp64_pred_on.sum", @@ -5430,43 +5838,43 @@ "name": "smsp__sass_thread_inst_executed_op_integer_pred_on.sum", "type": "Double", "unit": "", - "value": 209715200.0 + "value": 390070272.0 }, { "name": "smsp__sass_thread_inst_executed_op_control_pred_on.sum", "type": "Double", "unit": "", - "value": 67108864.0 + "value": 71303168.0 }, { "name": "smsp__sass_thread_inst_executed_op_memory_pred_on.sum", "type": "Double", "unit": "", - "value": 100663296.0 + "value": 285212672.0 }, { "name": "smsp__sass_thread_inst_executed_op_misc_pred_on.sum", "type": "Double", "unit": "", - "value": 162529280.0 + "value": 88080384.0 }, { "name": "smsp__sass_thread_inst_executed_op_conversion_pred_on.sum", "type": "Double", "unit": "", - "value": 3145728.0 + "value": 12582912.0 }, { "name": "smsp__inst_executed.sum", "type": "Double", "unit": "", - "value": 164495360.0 + "value": 197394432.0 }, { "name": "smsp__pipe_fma_cycles_active.avg.pct_of_peak_sustained_active", "type": "Double", "unit": "", - "value": 39.633757121104715 + "value": 45.839373795048495 }, { "name": "smsp__inst_executed_pipe_fp64.avg.pct_of_peak_sustained_active", @@ -5478,13 +5886,13 @@ "name": "smsp__inst_executed_pipe_xu.avg.pct_of_peak_sustained_active", "type": "Double", "unit": "", - "value": 95.65945910473754 + "value": 90.35358236695286 }, { "name": "smsp__inst_executed_pipe_lsu.avg.pct_of_peak_sustained_active", "type": "Double", "unit": "", - "value": 2.312081653166264 + "value": 6.264750339896145 }, { "name": "smsp__inst_executed_pipe_tex.avg.pct_of_peak_sustained_active", @@ -5496,7 +5904,7 @@ "name": "smsp__issue_active.avg.pct_of_peak_sustained_active", "type": "Double", "unit": "", - "value": 58.620782419172954 + "value": 66.44307378033095 }, { "name": "smsp__thread_inst_executed_per_inst_executed.ratio", @@ -5515,30 +5923,47 @@ "time" ], "times": { - "compilation": 13432.248, - "data": 64535.628, - "framework": 202649.111, - "kernel_overhead": 31862.656, - "profiling_overhead": 54143.116, - "profiling_runs": 52107.711, + "compilation_time": 13366.617, + "data": 58871.505, + "framework": 197588.87600000002, + "kernel_overhead": 34328.733, + "profiling_overhead": 48832.301, + "profiling_runs": 55556.337, "runtimes": [ - 1741.92 + 1841.952 ], - "search_algorithm": 21.493, - "validation": 11.876 + "search_algorithm": 18.768, + "validation": 14.379 }, - "timestamp": "2026-01-27 09:24:59 UTC" + "timestamp": "2026-03-13 09:39:45 UTC" }, { + "compilation_data": { + "constant_memory_size": 0, + "global_size": { + "x": 16, + "y": 32, + "z": 32 + }, + "local_memory_size": 0, + "local_size": { + "x": 16, + "y": 8, + "z": 1 + }, + "max_work_group_size": 1024, + "private_memory_size": 0, + "registers": 40 + }, "configuration": { - "INNER_UNROLL_FACTOR": "0", - "USE_CONSTANT_MEMORY": "0", - "USE_SOA": "0", - "VECTOR_SIZE": "1", - "WORK_GROUP_SIZE_X": "32", - "WORK_GROUP_SIZE_Y": "8", - "WORK_GROUP_SIZE_Z": "1", - "Z_ITERATIONS": "16" + "INNER_UNROLL_FACTOR": 0, + "USE_CONSTANT_MEMORY": 0, + "USE_SOA": 0, + "VECTOR_SIZE": 1, + "WORK_GROUP_SIZE_X": 16, + "WORK_GROUP_SIZE_Y": 8, + "WORK_GROUP_SIZE_Z": 1, + "Z_ITERATIONS": 8 }, "correctness": 1, "invalidity": "correct", @@ -5546,61 +5971,61 @@ { "name": "time", "unit": "", - "value": 1790.72 + "value": 1837.728 }, { "name": "dram__throughput.avg.pct_of_peak_sustained_elapsed", "type": "Double", "unit": "", - "value": 58.77111421969292 + "value": 26.500539961683057 }, { "name": "dram__sectors_read.sum", "type": "Double", "unit": "", - "value": 2101156.0 + "value": 4684.0 }, { "name": "dram__sectors_write.sum", "type": "Double", "unit": "", - "value": 1838032.0 + "value": 1872080.0 }, { "name": "lts__t_sectors.avg.pct_of_peak_sustained_elapsed", "type": "Double", "unit": "", - "value": 6.927101465448163 + "value": 3.4309140942649607 }, { "name": "lts__t_sectors_op_read.sum", "type": "Double", "unit": "", - "value": 2129914.0 + "value": 37341.0 }, { "name": "lts__t_sectors_op_write.sum", "type": "Double", "unit": "", - "value": 2100843.0 + "value": 2101235.0 }, { "name": "l1tex__t_requests_pipe_lsu_mem_global_op_ld.avg.pct_of_peak_sustained_active", "type": "Double", "unit": "", - "value": 3.7379921650233197 + "value": 5.797206092933397 }, { "name": "l1tex__t_requests_pipe_lsu_mem_global_op_ld.sum", "type": "Double", "unit": "", - "value": 2621440.0 + "value": 4194304.0 }, { "name": "l1tex__data_pipe_lsu_wavefronts_mem_shared.avg.pct_of_peak_sustained_elapsed", "type": "Double", "unit": "", - "value": 0.14007465404535668 + "value": 0.2715322979912289 }, { "name": "l1tex__data_pipe_lsu_wavefronts_mem_shared_op_ld.sum", @@ -5630,13 +6055,13 @@ "name": "sm__warps_active.avg.pct_of_peak_sustained_active", "type": "Double", "unit": "", - "value": 97.7732647224841 + "value": 98.359689151663 }, { "name": "smsp__cycles_active.avg.pct_of_peak_sustained_elapsed", "type": "Double", "unit": "", - "value": 99.91747366618162 + "value": 99.9109352296694 }, { "name": "smsp__sass_thread_inst_executed_op_fp16_pred_on.sum", @@ -5648,7 +6073,7 @@ "name": "smsp__sass_thread_inst_executed_op_fp32_pred_on.sum", "type": "Double", "unit": "", - "value": 4580179968.0 + "value": 4831838208.0 }, { "name": "smsp__sass_thread_inst_executed_op_fp64_pred_on.sum", @@ -5660,43 +6085,43 @@ "name": "smsp__sass_thread_inst_executed_op_integer_pred_on.sum", "type": "Double", "unit": "", - "value": 209715200.0 + "value": 362807296.0 }, { "name": "smsp__sass_thread_inst_executed_op_control_pred_on.sum", "type": "Double", "unit": "", - "value": 67108864.0 + "value": 69206016.0 }, { "name": "smsp__sass_thread_inst_executed_op_memory_pred_on.sum", "type": "Double", "unit": "", - "value": 100663296.0 + "value": 150994944.0 }, { "name": "smsp__sass_thread_inst_executed_op_misc_pred_on.sum", "type": "Double", "unit": "", - "value": 162529280.0 + "value": 127926272.0 }, { "name": "smsp__sass_thread_inst_executed_op_conversion_pred_on.sum", "type": "Double", "unit": "", - "value": 3145728.0 + "value": 6291456.0 }, { "name": "smsp__inst_executed.sum", "type": "Double", "unit": "", - "value": 164495360.0 + "value": 173801472.0 }, { "name": "smsp__pipe_fma_cycles_active.avg.pct_of_peak_sustained_active", "type": "Double", "unit": "", - "value": 39.650539434751344 + "value": 42.62338692899004 }, { "name": "smsp__inst_executed_pipe_fp64.avg.pct_of_peak_sustained_active", @@ -5708,13 +6133,13 @@ "name": "smsp__inst_executed_pipe_xu.avg.pct_of_peak_sustained_active", "type": "Double", "unit": "", - "value": 95.70327756795102 + "value": 92.76564589046416 }, { "name": "smsp__inst_executed_pipe_lsu.avg.pct_of_peak_sustained_active", "type": "Double", "unit": "", - "value": 2.3131407419988164 + "value": 3.3971794149339902 }, { "name": "smsp__inst_executed_pipe_tex.avg.pct_of_peak_sustained_active", @@ -5726,7 +6151,7 @@ "name": "smsp__issue_active.avg.pct_of_peak_sustained_active", "type": "Double", "unit": "", - "value": 58.647622556683174 + "value": 60.06336231713292 }, { "name": "smsp__thread_inst_executed_per_inst_executed.ratio", @@ -5745,30 +6170,47 @@ "time" ], "times": { - "compilation": 12985.032, - "data": 64956.891, - "framework": 204811.138, - "kernel_overhead": 32517.953, - "profiling_overhead": 54602.995, - "profiling_runs": 52733.299, + "compilation_time": 13577.951, + "data": 58117.824, + "framework": 185445.572, + "kernel_overhead": 29103.463, + "profiling_overhead": 48310.23, + "profiling_runs": 49914.055, "runtimes": [ - 1790.72 + 1837.728 ], - "search_algorithm": 18.233, - "validation": 14.223 + "search_algorithm": 20.836, + "validation": 12.404 }, - "timestamp": "2026-01-27 09:24:59 UTC" + "timestamp": "2026-03-13 09:39:45 UTC" }, { + "compilation_data": { + "constant_memory_size": 0, + "global_size": { + "x": 16, + "y": 32, + "z": 16 + }, + "local_memory_size": 0, + "local_size": { + "x": 16, + "y": 8, + "z": 1 + }, + "max_work_group_size": 1024, + "private_memory_size": 0, + "registers": 40 + }, "configuration": { - "INNER_UNROLL_FACTOR": "0", - "USE_CONSTANT_MEMORY": "0", - "USE_SOA": "0", - "VECTOR_SIZE": "1", - "WORK_GROUP_SIZE_X": "16", - "WORK_GROUP_SIZE_Y": "4", - "WORK_GROUP_SIZE_Z": "1", - "Z_ITERATIONS": "32" + "INNER_UNROLL_FACTOR": 0, + "USE_CONSTANT_MEMORY": 0, + "USE_SOA": 0, + "VECTOR_SIZE": 1, + "WORK_GROUP_SIZE_X": 16, + "WORK_GROUP_SIZE_Y": 8, + "WORK_GROUP_SIZE_Z": 1, + "Z_ITERATIONS": 16 }, "correctness": 1, "invalidity": "correct", @@ -5776,61 +6218,61 @@ { "name": "time", "unit": "", - "value": 1754.944 + "value": 1755.424 }, { "name": "dram__throughput.avg.pct_of_peak_sustained_elapsed", "type": "Double", "unit": "", - "value": 59.67388058263651 + "value": 28.092722892028878 }, { "name": "dram__sectors_read.sum", "type": "Double", "unit": "", - "value": 2100600.0 + "value": 932.0 }, { "name": "dram__sectors_write.sum", "type": "Double", "unit": "", - "value": 1863808.0 + "value": 1872084.0 }, { "name": "lts__t_sectors.avg.pct_of_peak_sustained_elapsed", "type": "Double", "unit": "", - "value": 6.9769104000135185 + "value": 3.556355924168465 }, { "name": "lts__t_sectors_op_read.sum", "type": "Double", "unit": "", - "value": 2123628.0 + "value": 29802.0 }, { "name": "lts__t_sectors_op_write.sum", "type": "Double", "unit": "", - "value": 2103685.0 + "value": 2100314.0 }, { "name": "l1tex__t_requests_pipe_lsu_mem_global_op_ld.avg.pct_of_peak_sustained_active", "type": "Double", "unit": "", - "value": 2.243490519241941 + "value": 2.9952640837335727 }, { "name": "l1tex__t_requests_pipe_lsu_mem_global_op_ld.sum", "type": "Double", "unit": "", - "value": 1572864.0 + "value": 2097152.0 }, { "name": "l1tex__data_pipe_lsu_wavefronts_mem_shared.avg.pct_of_peak_sustained_elapsed", "type": "Double", "unit": "", - "value": 0.07010829472185184 + "value": 0.1402491568272737 }, { "name": "l1tex__data_pipe_lsu_wavefronts_mem_shared_op_ld.sum", @@ -5860,13 +6302,13 @@ "name": "sm__warps_active.avg.pct_of_peak_sustained_active", "type": "Double", "unit": "", - "value": 81.39392488448567 + "value": 98.3595737905309 }, { "name": "smsp__cycles_active.avg.pct_of_peak_sustained_elapsed", "type": "Double", "unit": "", - "value": 99.9594429676896 + "value": 99.87680939730984 }, { "name": "smsp__sass_thread_inst_executed_op_fp16_pred_on.sum", @@ -5878,7 +6320,7 @@ "name": "smsp__sass_thread_inst_executed_op_fp32_pred_on.sum", "type": "Double", "unit": "", - "value": 4445962240.0 + "value": 4563402752.0 }, { "name": "smsp__sass_thread_inst_executed_op_fp64_pred_on.sum", @@ -5890,43 +6332,43 @@ "name": "smsp__sass_thread_inst_executed_op_integer_pred_on.sum", "type": "Double", "unit": "", - "value": 138412032.0 + "value": 209715200.0 }, { "name": "smsp__sass_thread_inst_executed_op_control_pred_on.sum", "type": "Double", "unit": "", - "value": 33554432.0 + "value": 67108864.0 }, { "name": "smsp__sass_thread_inst_executed_op_memory_pred_on.sum", "type": "Double", "unit": "", - "value": 67108864.0 + "value": 83886080.0 }, { "name": "smsp__sass_thread_inst_executed_op_misc_pred_on.sum", "type": "Double", "unit": "", - "value": 83361792.0 + "value": 160432128.0 }, { "name": "smsp__sass_thread_inst_executed_op_conversion_pred_on.sum", "type": "Double", "unit": "", - "value": 1572864.0 + "value": 3145728.0 }, { "name": "smsp__inst_executed.sum", "type": "Double", "unit": "", - "value": 151257088.0 + "value": 163381248.0 }, { "name": "smsp__pipe_fma_cycles_active.avg.pct_of_peak_sustained_active", "type": "Double", "unit": "", - "value": 38.167954276302055 + "value": 39.528378494386686 }, { "name": "smsp__inst_executed_pipe_fp64.avg.pct_of_peak_sustained_active", @@ -5938,13 +6380,13 @@ "name": "smsp__inst_executed_pipe_xu.avg.pct_of_peak_sustained_active", "type": "Double", "unit": "", - "value": 95.7600291528657 + "value": 95.86151677434778 }, { "name": "smsp__inst_executed_pipe_lsu.avg.pct_of_peak_sustained_active", "type": "Double", "unit": "", - "value": 1.5313188255646248 + "value": 1.9425063213551914 }, { "name": "smsp__inst_executed_pipe_tex.avg.pct_of_peak_sustained_active", @@ -5956,7 +6398,7 @@ "name": "smsp__issue_active.avg.pct_of_peak_sustained_active", "type": "Double", "unit": "", - "value": 53.95971070078721 + "value": 58.3467034570093 }, { "name": "smsp__thread_inst_executed_per_inst_executed.ratio", @@ -5975,30 +6417,47 @@ "time" ], "times": { - "compilation": 13505.473, - "data": 63983.618, - "framework": 182594.415, - "kernel_overhead": 22204.429, - "profiling_overhead": 53752.883, - "profiling_runs": 42653.485, + "compilation_time": 13604.882, + "data": 59623.043, + "framework": 194224.927, + "kernel_overhead": 32008.704, + "profiling_overhead": 50071.8, + "profiling_runs": 52521.38, "runtimes": [ - 1754.944 + 1755.424 ], - "search_algorithm": 26.005, - "validation": 17.642 + "search_algorithm": 21.328, + "validation": 14.487 }, - "timestamp": "2026-01-27 09:24:59 UTC" + "timestamp": "2026-03-13 09:39:45 UTC" }, { + "compilation_data": { + "constant_memory_size": 0, + "global_size": { + "x": 16, + "y": 32, + "z": 8 + }, + "local_memory_size": 0, + "local_size": { + "x": 16, + "y": 8, + "z": 1 + }, + "max_work_group_size": 1024, + "private_memory_size": 0, + "registers": 48 + }, "configuration": { - "INNER_UNROLL_FACTOR": "0", - "USE_CONSTANT_MEMORY": "0", - "USE_SOA": "0", - "VECTOR_SIZE": "1", - "WORK_GROUP_SIZE_X": "16", - "WORK_GROUP_SIZE_Y": "8", - "WORK_GROUP_SIZE_Z": "1", - "Z_ITERATIONS": "32" + "INNER_UNROLL_FACTOR": 0, + "USE_CONSTANT_MEMORY": 0, + "USE_SOA": 0, + "VECTOR_SIZE": 1, + "WORK_GROUP_SIZE_X": 16, + "WORK_GROUP_SIZE_Y": 8, + "WORK_GROUP_SIZE_Z": 1, + "Z_ITERATIONS": 32 }, "correctness": 1, "invalidity": "correct", @@ -6006,61 +6465,61 @@ { "name": "time", "unit": "", - "value": 1722.816 + "value": 1795.904 }, { "name": "dram__throughput.avg.pct_of_peak_sustained_elapsed", "type": "Double", "unit": "", - "value": 60.675194965111515 + "value": 27.867701738997784 }, { "name": "dram__sectors_read.sum", "type": "Double", "unit": "", - "value": 2103752.0 + "value": 924.0 }, { "name": "dram__sectors_write.sum", "type": "Double", "unit": "", - "value": 1869784.0 + "value": 1869788.0 }, { "name": "lts__t_sectors.avg.pct_of_peak_sustained_elapsed", "type": "Double", "unit": "", - "value": 6.988844269227535 + "value": 3.52133640501779 }, { "name": "lts__t_sectors_op_read.sum", "type": "Double", "unit": "", - "value": 2124136.0 + "value": 29250.0 }, { "name": "lts__t_sectors_op_write.sum", "type": "Double", "unit": "", - "value": 2103583.0 + "value": 2099630.0 }, { "name": "l1tex__t_requests_pipe_lsu_mem_global_op_ld.avg.pct_of_peak_sustained_active", "type": "Double", "unit": "", - "value": 2.24328586246011 + "value": 1.4943379101412644 }, { "name": "l1tex__t_requests_pipe_lsu_mem_global_op_ld.sum", "type": "Double", "unit": "", - "value": 1572864.0 + "value": 1048576.0 }, { "name": "l1tex__data_pipe_lsu_wavefronts_mem_shared.avg.pct_of_peak_sustained_elapsed", "type": "Double", "unit": "", - "value": 0.06997631238045428 + "value": 0.06996971395551867 }, { "name": "l1tex__data_pipe_lsu_wavefronts_mem_shared_op_ld.sum", @@ -6090,13 +6549,13 @@ "name": "sm__warps_active.avg.pct_of_peak_sustained_active", "type": "Double", "unit": "", - "value": 81.09543635078947 + "value": 81.7074426806898 }, { "name": "smsp__cycles_active.avg.pct_of_peak_sustained_elapsed", "type": "Double", "unit": "", - "value": 99.81994859349207 + "value": 99.8849699867053 }, { "name": "smsp__sass_thread_inst_executed_op_fp16_pred_on.sum", @@ -6108,7 +6567,7 @@ "name": "smsp__sass_thread_inst_executed_op_fp32_pred_on.sum", "type": "Double", "unit": "", - "value": 4445962240.0 + "value": 4429185024.0 }, { "name": "smsp__sass_thread_inst_executed_op_fp64_pred_on.sum", @@ -6132,13 +6591,13 @@ "name": "smsp__sass_thread_inst_executed_op_memory_pred_on.sum", "type": "Double", "unit": "", - "value": 67108864.0 + "value": 50331648.0 }, { "name": "smsp__sass_thread_inst_executed_op_misc_pred_on.sum", "type": "Double", "unit": "", - "value": 83361792.0 + "value": 82837504.0 }, { "name": "smsp__sass_thread_inst_executed_op_conversion_pred_on.sum", @@ -6150,13 +6609,13 @@ "name": "smsp__inst_executed.sum", "type": "Double", "unit": "", - "value": 151257088.0 + "value": 150192128.0 }, { "name": "smsp__pipe_fma_cycles_active.avg.pct_of_peak_sustained_active", "type": "Double", "unit": "", - "value": 38.14771877472932 + "value": 37.93239994961857 }, { "name": "smsp__inst_executed_pipe_fp64.avg.pct_of_peak_sustained_active", @@ -6168,13 +6627,13 @@ "name": "smsp__inst_executed_pipe_xu.avg.pct_of_peak_sustained_active", "type": "Double", "unit": "", - "value": 95.71332502470273 + "value": 95.6419997923446 }, { "name": "smsp__inst_executed_pipe_lsu.avg.pct_of_peak_sustained_active", "type": "Double", "unit": "", - "value": 1.5305719699995188 + "value": 1.1558298314748676 }, { "name": "smsp__inst_executed_pipe_tex.avg.pct_of_peak_sustained_active", @@ -6186,7 +6645,7 @@ "name": "smsp__issue_active.avg.pct_of_peak_sustained_active", "type": "Double", "unit": "", - "value": 53.933393444344205 + "value": 53.51376338360871 }, { "name": "smsp__thread_inst_executed_per_inst_executed.ratio", @@ -6205,30 +6664,47 @@ "time" ], "times": { - "compilation": 13400.194, - "data": 65480.215, - "framework": 184422.36899999998, - "kernel_overhead": 22144.492, - "profiling_overhead": 54497.786, - "profiling_runs": 42299.876, + "compilation_time": 13422.813, + "data": 59593.235, + "framework": 174816.108, + "kernel_overhead": 22388.917, + "profiling_overhead": 50047.485, + "profiling_runs": 42786.471, "runtimes": [ - 1722.816 + 1795.904 ], - "search_algorithm": 22.003, - "validation": 18.784 + "search_algorithm": 29.27, + "validation": 14.876 }, - "timestamp": "2026-01-27 09:24:59 UTC" + "timestamp": "2026-03-13 09:39:45 UTC" }, { + "compilation_data": { + "constant_memory_size": 0, + "global_size": { + "x": 16, + "y": 32, + "z": 128 + }, + "local_memory_size": 0, + "local_size": { + "x": 16, + "y": 8, + "z": 1 + }, + "max_work_group_size": 1024, + "private_memory_size": 0, + "registers": 19 + }, "configuration": { - "INNER_UNROLL_FACTOR": "0", - "USE_CONSTANT_MEMORY": "0", - "USE_SOA": "0", - "VECTOR_SIZE": "1", - "WORK_GROUP_SIZE_X": "32", - "WORK_GROUP_SIZE_Y": "2", - "WORK_GROUP_SIZE_Z": "1", - "Z_ITERATIONS": "32" + "INNER_UNROLL_FACTOR": 1, + "USE_CONSTANT_MEMORY": 0, + "USE_SOA": 0, + "VECTOR_SIZE": 1, + "WORK_GROUP_SIZE_X": 16, + "WORK_GROUP_SIZE_Y": 8, + "WORK_GROUP_SIZE_Z": 1, + "Z_ITERATIONS": 2 }, "correctness": 1, "invalidity": "correct", @@ -6236,61 +6712,61 @@ { "name": "time", "unit": "", - "value": 1704.864 + "value": 5159.36 }, { "name": "dram__throughput.avg.pct_of_peak_sustained_elapsed", "type": "Double", "unit": "", - "value": 60.017881139718774 + "value": 9.518882471592383 }, { "name": "dram__sectors_read.sum", "type": "Double", "unit": "", - "value": 2104368.0 + "value": 5808.0 }, { "name": "dram__sectors_write.sum", "type": "Double", "unit": "", - "value": 1840180.0 + "value": 1872796.0 }, { "name": "lts__t_sectors.avg.pct_of_peak_sustained_elapsed", "type": "Double", "unit": "", - "value": 7.029281433075274 + "value": 1.2159524639114718 }, { "name": "lts__t_sectors_op_read.sum", "type": "Double", "unit": "", - "value": 2134800.0 + "value": 96144.0 }, { "name": "lts__t_sectors_op_write.sum", "type": "Double", "unit": "", - "value": 2104536.0 + "value": 2107595.0 }, { "name": "l1tex__t_requests_pipe_lsu_mem_global_op_ld.avg.pct_of_peak_sustained_active", "type": "Double", "unit": "", - "value": 2.247705373827939 + "value": 7.954727189749554 }, { "name": "l1tex__t_requests_pipe_lsu_mem_global_op_ld.sum", "type": "Double", "unit": "", - "value": 1572864.0 + "value": 16777216.0 }, { "name": "l1tex__data_pipe_lsu_wavefronts_mem_shared.avg.pct_of_peak_sustained_elapsed", "type": "Double", "unit": "", - "value": 0.07018646134689693 + "value": 0.37276893786334964 }, { "name": "l1tex__data_pipe_lsu_wavefronts_mem_shared_op_ld.sum", @@ -6320,13 +6796,13 @@ "name": "sm__warps_active.avg.pct_of_peak_sustained_active", "type": "Double", "unit": "", - "value": 81.53105567311181 + "value": 97.16883784096825 }, { "name": "smsp__cycles_active.avg.pct_of_peak_sustained_elapsed", "type": "Double", "unit": "", - "value": 99.90219382669862 + "value": 99.96496214103395 }, { "name": "smsp__sass_thread_inst_executed_op_fp16_pred_on.sum", @@ -6338,7 +6814,7 @@ "name": "smsp__sass_thread_inst_executed_op_fp32_pred_on.sum", "type": "Double", "unit": "", - "value": 4445962240.0 + "value": 6979321856.0 }, { "name": "smsp__sass_thread_inst_executed_op_fp64_pred_on.sum", @@ -6350,43 +6826,43 @@ "name": "smsp__sass_thread_inst_executed_op_integer_pred_on.sum", "type": "Double", "unit": "", - "value": 138412032.0 + "value": 8212447232.0 }, { "name": "smsp__sass_thread_inst_executed_op_control_pred_on.sum", "type": "Double", "unit": "", - "value": 33554432.0 + "value": 1073741824.0 }, { "name": "smsp__sass_thread_inst_executed_op_memory_pred_on.sum", "type": "Double", "unit": "", - "value": 67108864.0 + "value": 553648128.0 }, { "name": "smsp__sass_thread_inst_executed_op_misc_pred_on.sum", "type": "Double", "unit": "", - "value": 83361792.0 + "value": 3321888768.0 }, { "name": "smsp__sass_thread_inst_executed_op_conversion_pred_on.sum", "type": "Double", "unit": "", - "value": 1572864.0 + "value": 25165824.0 }, { "name": "smsp__inst_executed.sum", "type": "Double", "unit": "", - "value": 151257088.0 + "value": 715390976.0 }, { "name": "smsp__pipe_fma_cycles_active.avg.pct_of_peak_sustained_active", "type": "Double", "unit": "", - "value": 38.230691048060685 + "value": 36.26446962192443 }, { "name": "smsp__inst_executed_pipe_fp64.avg.pct_of_peak_sustained_active", @@ -6398,13 +6874,13 @@ "name": "smsp__inst_executed_pipe_xu.avg.pct_of_peak_sustained_active", "type": "Double", "unit": "", - "value": 95.921732601652 + "value": 31.820765345889647 }, { "name": "smsp__inst_executed_pipe_lsu.avg.pct_of_peak_sustained_active", "type": "Double", "unit": "", - "value": 1.533904659523488 + "value": 4.288345329817159 }, { "name": "smsp__inst_executed_pipe_tex.avg.pct_of_peak_sustained_active", @@ -6416,7 +6892,7 @@ "name": "smsp__issue_active.avg.pct_of_peak_sustained_active", "type": "Double", "unit": "", - "value": 54.05082879455782 + "value": 84.80406107929313 }, { "name": "smsp__thread_inst_executed_per_inst_executed.ratio", @@ -6435,30 +6911,47 @@ "time" ], "times": { - "compilation": 13571.813, - "data": 64392.441, - "framework": 182234.56999999998, - "kernel_overhead": 22074.925, - "profiling_overhead": 53486.564, - "profiling_runs": 42280.64, + "compilation_time": 13026.26, + "data": 57987.826, + "framework": 2065597.763, + "kernel_overhead": 961263.118, + "profiling_overhead": 47990.466, + "profiling_runs": 998356.353, "runtimes": [ - 1704.864 + 5159.36 ], - "search_algorithm": 24.73, - "validation": 14.987 + "search_algorithm": 25.943, + "validation": 15.255 }, - "timestamp": "2026-01-27 09:24:59 UTC" + "timestamp": "2026-03-13 09:39:47 UTC" }, { + "compilation_data": { + "constant_memory_size": 0, + "global_size": { + "x": 16, + "y": 32, + "z": 64 + }, + "local_memory_size": 0, + "local_size": { + "x": 16, + "y": 8, + "z": 1 + }, + "max_work_group_size": 1024, + "private_memory_size": 0, + "registers": 22 + }, "configuration": { - "INNER_UNROLL_FACTOR": "0", - "USE_CONSTANT_MEMORY": "0", - "USE_SOA": "0", - "VECTOR_SIZE": "1", - "WORK_GROUP_SIZE_X": "32", - "WORK_GROUP_SIZE_Y": "4", - "WORK_GROUP_SIZE_Z": "1", - "Z_ITERATIONS": "32" + "INNER_UNROLL_FACTOR": 1, + "USE_CONSTANT_MEMORY": 0, + "USE_SOA": 0, + "VECTOR_SIZE": 1, + "WORK_GROUP_SIZE_X": 16, + "WORK_GROUP_SIZE_Y": 8, + "WORK_GROUP_SIZE_Z": 1, + "Z_ITERATIONS": 4 }, "correctness": 1, "invalidity": "correct", @@ -6466,61 +6959,61 @@ { "name": "time", "unit": "", - "value": 1730.848 + "value": 5756.032 }, { "name": "dram__throughput.avg.pct_of_peak_sustained_elapsed", "type": "Double", "unit": "", - "value": 59.626270571151984 + "value": 8.55675457074028 }, { "name": "dram__sectors_read.sum", "type": "Double", "unit": "", - "value": 2102144.0 + "value": 8612.0 }, { "name": "dram__sectors_write.sum", "type": "Double", "unit": "", - "value": 1839868.0 + "value": 1873644.0 }, { "name": "lts__t_sectors.avg.pct_of_peak_sustained_elapsed", "type": "Double", "unit": "", - "value": 6.981288753589351 + "value": 1.1096452493547357 }, { "name": "lts__t_sectors_op_read.sum", "type": "Double", "unit": "", - "value": 2131815.0 + "value": 107082.0 }, { "name": "lts__t_sectors_op_write.sum", "type": "Double", "unit": "", - "value": 2102732.0 + "value": 2109418.0 }, { "name": "l1tex__t_requests_pipe_lsu_mem_global_op_ld.avg.pct_of_peak_sustained_active", "type": "Double", "unit": "", - "value": 2.248313651121233 + "value": 3.624640963810151 }, { "name": "l1tex__t_requests_pipe_lsu_mem_global_op_ld.sum", "type": "Double", "unit": "", - "value": 1572864.0 + "value": 8388608.0 }, { "name": "l1tex__data_pipe_lsu_wavefronts_mem_shared.avg.pct_of_peak_sustained_elapsed", "type": "Double", "unit": "", - "value": 0.0701807611505624 + "value": 0.16987352834251276 }, { "name": "l1tex__data_pipe_lsu_wavefronts_mem_shared_op_ld.sum", @@ -6550,13 +7043,13 @@ "name": "sm__warps_active.avg.pct_of_peak_sustained_active", "type": "Double", "unit": "", - "value": 81.50586725639425 + "value": 98.66702736100524 }, { "name": "smsp__cycles_active.avg.pct_of_peak_sustained_elapsed", "type": "Double", "unit": "", - "value": 99.89632695839047 + "value": 99.97499964747925 }, { "name": "smsp__sass_thread_inst_executed_op_fp16_pred_on.sum", @@ -6568,7 +7061,7 @@ "name": "smsp__sass_thread_inst_executed_op_fp32_pred_on.sum", "type": "Double", "unit": "", - "value": 4445962240.0 + "value": 5637144576.0 }, { "name": "smsp__sass_thread_inst_executed_op_fp64_pred_on.sum", @@ -6580,43 +7073,43 @@ "name": "smsp__sass_thread_inst_executed_op_integer_pred_on.sum", "type": "Double", "unit": "", - "value": 138412032.0 + "value": 10049552384.0 }, { "name": "smsp__sass_thread_inst_executed_op_control_pred_on.sum", "type": "Double", "unit": "", - "value": 33554432.0 + "value": 1073741824.0 }, { "name": "smsp__sass_thread_inst_executed_op_memory_pred_on.sum", "type": "Double", "unit": "", - "value": 67108864.0 + "value": 285212672.0 }, { "name": "smsp__sass_thread_inst_executed_op_misc_pred_on.sum", "type": "Double", "unit": "", - "value": 83361792.0 + "value": 1392508928.0 }, { "name": "smsp__sass_thread_inst_executed_op_conversion_pred_on.sum", "type": "Double", "unit": "", - "value": 1572864.0 + "value": 12582912.0 }, { "name": "smsp__inst_executed.sum", "type": "Double", "unit": "", - "value": 151257088.0 + "value": 786956288.0 }, { "name": "smsp__pipe_fma_cycles_active.avg.pct_of_peak_sustained_active", "type": "Double", "unit": "", - "value": 38.22787235707018 + "value": 33.82803292958803 }, { "name": "smsp__inst_executed_pipe_fp64.avg.pct_of_peak_sustained_active", @@ -6628,13 +7121,13 @@ "name": "smsp__inst_executed_pipe_xu.avg.pct_of_peak_sustained_active", "type": "Double", "unit": "", - "value": 95.91957529877905 + "value": 28.99899868902192 }, { "name": "smsp__inst_executed_pipe_lsu.avg.pct_of_peak_sustained_active", "type": "Double", "unit": "", - "value": 1.5338701616381905 + "value": 2.0106727606646055 }, { "name": "smsp__inst_executed_pipe_tex.avg.pct_of_peak_sustained_active", @@ -6646,7 +7139,7 @@ "name": "smsp__issue_active.avg.pct_of_peak_sustained_active", "type": "Double", "unit": "", - "value": 54.04961317839785 + "value": 85.01505725900279 }, { "name": "smsp__thread_inst_executed_per_inst_executed.ratio", @@ -6665,30 +7158,47 @@ "time" ], "times": { - "compilation": 13335.483, - "data": 65858.765, - "framework": 184918.30299999999, - "kernel_overhead": 22088.422, - "profiling_overhead": 54717.279, - "profiling_runs": 42253.837, + "compilation_time": 13985.204, + "data": 62166.857, + "framework": 2654925.2520000003, + "kernel_overhead": 1250885.075, + "profiling_overhead": 50871.956, + "profiling_runs": 1291001.364, "runtimes": [ - 1730.848 + 5756.032 ], - "search_algorithm": 19.03, - "validation": 11.66 + "search_algorithm": 25.914, + "validation": 17.771 }, - "timestamp": "2026-01-27 09:24:59 UTC" + "timestamp": "2026-03-13 09:39:48 UTC" }, { + "compilation_data": { + "constant_memory_size": 0, + "global_size": { + "x": 16, + "y": 32, + "z": 32 + }, + "local_memory_size": 0, + "local_size": { + "x": 16, + "y": 8, + "z": 1 + }, + "max_work_group_size": 1024, + "private_memory_size": 32, + "registers": 27 + }, "configuration": { - "INNER_UNROLL_FACTOR": "0", - "USE_CONSTANT_MEMORY": "0", - "USE_SOA": "0", - "VECTOR_SIZE": "1", - "WORK_GROUP_SIZE_X": "32", - "WORK_GROUP_SIZE_Y": "8", - "WORK_GROUP_SIZE_Z": "1", - "Z_ITERATIONS": "32" + "INNER_UNROLL_FACTOR": 1, + "USE_CONSTANT_MEMORY": 0, + "USE_SOA": 0, + "VECTOR_SIZE": 1, + "WORK_GROUP_SIZE_X": 16, + "WORK_GROUP_SIZE_Y": 8, + "WORK_GROUP_SIZE_Z": 1, + "Z_ITERATIONS": 8 }, "correctness": 1, "invalidity": "correct", @@ -6696,61 +7206,61 @@ { "name": "time", "unit": "", - "value": 1725.632 + "value": 8055.04 }, { "name": "dram__throughput.avg.pct_of_peak_sustained_elapsed", "type": "Double", "unit": "", - "value": 59.60361721079741 + "value": 6.260086455331412 }, { "name": "dram__sectors_read.sum", "type": "Double", "unit": "", - "value": 2100492.0 + "value": 15192.0 }, { "name": "dram__sectors_write.sum", "type": "Double", "unit": "", - "value": 1839412.0 + "value": 1931144.0 }, { "name": "lts__t_sectors.avg.pct_of_peak_sustained_elapsed", "type": "Double", "unit": "", - "value": 6.971019940782175 + "value": 49.266592908175404 }, { "name": "lts__t_sectors_op_read.sum", "type": "Double", "unit": "", - "value": 2129043.0 + "value": 1200427.0 }, { "name": "lts__t_sectors_op_write.sum", "type": "Double", "unit": "", - "value": 2099494.0 + "value": 138417546.0 }, { "name": "l1tex__t_requests_pipe_lsu_mem_global_op_ld.avg.pct_of_peak_sustained_active", "type": "Double", "unit": "", - "value": 2.2475482172929455 + "value": 1.2783202307285508 }, { "name": "l1tex__t_requests_pipe_lsu_mem_global_op_ld.sum", "type": "Double", "unit": "", - "value": 1572864.0 + "value": 4194304.0 }, { "name": "l1tex__data_pipe_lsu_wavefronts_mem_shared.avg.pct_of_peak_sustained_elapsed", "type": "Double", "unit": "", - "value": 0.0701891875064617 + "value": 0.059691113062925016 }, { "name": "l1tex__data_pipe_lsu_wavefronts_mem_shared_op_ld.sum", @@ -6768,25 +7278,25 @@ "name": "l1tex__t_sectors_pipe_lsu_mem_local_op_ld.sum", "type": "Double", "unit": "", - "value": 0.0 + "value": 136314880.0 }, { "name": "l1tex__t_sectors_pipe_lsu_mem_local_op_st.sum", "type": "Double", "unit": "", - "value": 0.0 + "value": 136314880.0 }, { "name": "sm__warps_active.avg.pct_of_peak_sustained_active", "type": "Double", "unit": "", - "value": 81.27653118566319 + "value": 97.51032740096758 }, { "name": "smsp__cycles_active.avg.pct_of_peak_sustained_elapsed", "type": "Double", "unit": "", - "value": 99.8957761537262 + "value": 99.83519608176032 }, { "name": "smsp__sass_thread_inst_executed_op_fp16_pred_on.sum", @@ -6798,7 +7308,7 @@ "name": "smsp__sass_thread_inst_executed_op_fp32_pred_on.sum", "type": "Double", "unit": "", - "value": 4445962240.0 + "value": 4966055936.0 }, { "name": "smsp__sass_thread_inst_executed_op_fp64_pred_on.sum", @@ -6810,43 +7320,43 @@ "name": "smsp__sass_thread_inst_executed_op_integer_pred_on.sum", "type": "Double", "unit": "", - "value": 138412032.0 + "value": 3982491648.0 }, { "name": "smsp__sass_thread_inst_executed_op_control_pred_on.sum", "type": "Double", "unit": "", - "value": 33554432.0 + "value": 1073741824.0 }, { "name": "smsp__sass_thread_inst_executed_op_memory_pred_on.sum", "type": "Double", "unit": "", - "value": 67108864.0 + "value": 2319450112.0 }, { "name": "smsp__sass_thread_inst_executed_op_misc_pred_on.sum", "type": "Double", "unit": "", - "value": 83361792.0 + "value": 155189248.0 }, { "name": "smsp__sass_thread_inst_executed_op_conversion_pred_on.sum", "type": "Double", "unit": "", - "value": 1572864.0 + "value": 6291456.0 }, { "name": "smsp__inst_executed.sum", "type": "Double", "unit": "", - "value": 151257088.0 + "value": 395247616.0 }, { "name": "smsp__pipe_fma_cycles_active.avg.pct_of_peak_sustained_active", "type": "Double", "unit": "", - "value": 38.23608453242712 + "value": 10.671041326042738 }, { "name": "smsp__inst_executed_pipe_fp64.avg.pct_of_peak_sustained_active", @@ -6858,13 +7368,13 @@ "name": "smsp__inst_executed_pipe_xu.avg.pct_of_peak_sustained_active", "type": "Double", "unit": "", - "value": 95.93162096732067 + "value": 20.40820010556123 }, { "name": "smsp__inst_executed_pipe_lsu.avg.pct_of_peak_sustained_active", "type": "Double", "unit": "", - "value": 1.5340627864647227 + "value": 11.05112007669307 }, { "name": "smsp__inst_executed_pipe_tex.avg.pct_of_peak_sustained_active", @@ -6876,7 +7386,7 @@ "name": "smsp__issue_active.avg.pct_of_peak_sustained_active", "type": "Double", "unit": "", - "value": 54.05640077856311 + "value": 30.049609096220436 }, { "name": "smsp__thread_inst_executed_per_inst_executed.ratio", @@ -6895,30 +7405,47 @@ "time" ], "times": { - "compilation": 12820.868, - "data": 65937.843, - "framework": 185535.539, - "kernel_overhead": 22051.137, - "profiling_overhead": 55328.782, - "profiling_runs": 42217.777, + "compilation_time": 13830.299, + "data": 59164.738, + "framework": 894372.412, + "kernel_overhead": 367136.288, + "profiling_overhead": 49041.685, + "profiling_runs": 419029.701, "runtimes": [ - 1725.632 + 8055.04 ], - "search_algorithm": 22.698, - "validation": 15.01 + "search_algorithm": 25.595, + "validation": 14.667 }, - "timestamp": "2026-01-27 09:25:0 UTC" + "timestamp": "2026-03-13 09:39:48 UTC" }, { + "compilation_data": { + "constant_memory_size": 0, + "global_size": { + "x": 16, + "y": 32, + "z": 16 + }, + "local_memory_size": 0, + "local_size": { + "x": 16, + "y": 8, + "z": 1 + }, + "max_work_group_size": 1024, + "private_memory_size": 64, + "registers": 32 + }, "configuration": { - "INNER_UNROLL_FACTOR": "1", - "USE_CONSTANT_MEMORY": "0", - "USE_SOA": "0", - "VECTOR_SIZE": "1", - "WORK_GROUP_SIZE_X": "16", - "WORK_GROUP_SIZE_Y": "4", - "WORK_GROUP_SIZE_Z": "1", - "Z_ITERATIONS": "2" + "INNER_UNROLL_FACTOR": 1, + "USE_CONSTANT_MEMORY": 0, + "USE_SOA": 0, + "VECTOR_SIZE": 1, + "WORK_GROUP_SIZE_X": 16, + "WORK_GROUP_SIZE_Y": 8, + "WORK_GROUP_SIZE_Z": 1, + "Z_ITERATIONS": 16 }, "correctness": 1, "invalidity": "correct", @@ -6926,61 +7453,61 @@ { "name": "time", "unit": "", - "value": 5245.76 + "value": 8502.048 }, { "name": "dram__throughput.avg.pct_of_peak_sustained_elapsed", "type": "Double", "unit": "", - "value": 19.658575546556662 + "value": 6.3602487868904625 }, { "name": "dram__sectors_read.sum", "type": "Double", "unit": "", - "value": 2112192.0 + "value": 22976.0 }, { "name": "dram__sectors_write.sum", "type": "Double", "unit": "", - "value": 1877096.0 + "value": 2023864.0 }, { "name": "lts__t_sectors.avg.pct_of_peak_sustained_elapsed", "type": "Double", "unit": "", - "value": 2.3279177803376285 + "value": 54.56646998103 }, { "name": "lts__t_sectors_op_read.sum", "type": "Double", "unit": "", - "value": 2190544.0 + "value": 18157608.0 }, { "name": "lts__t_sectors_op_write.sum", "type": "Double", "unit": "", - "value": 2103788.0 + "value": 138422573.0 }, { "name": "l1tex__t_requests_pipe_lsu_mem_global_op_ld.avg.pct_of_peak_sustained_active", "type": "Double", "unit": "", - "value": 8.09877989895043 + "value": 0.6310957584696835 }, { "name": "l1tex__t_requests_pipe_lsu_mem_global_op_ld.sum", "type": "Double", "unit": "", - "value": 17301504.0 + "value": 2097152.0 }, { "name": "l1tex__data_pipe_lsu_wavefronts_mem_shared.avg.pct_of_peak_sustained_elapsed", "type": "Double", "unit": "", - "value": 0.3680333338524776 + "value": 0.029443231087911578 }, { "name": "l1tex__data_pipe_lsu_wavefronts_mem_shared_op_ld.sum", @@ -6998,25 +7525,25 @@ "name": "l1tex__t_sectors_pipe_lsu_mem_local_op_ld.sum", "type": "Double", "unit": "", - "value": 0.0 + "value": 136314880.0 }, { "name": "l1tex__t_sectors_pipe_lsu_mem_local_op_st.sum", "type": "Double", "unit": "", - "value": 0.0 + "value": 136314880.0 }, { "name": "sm__warps_active.avg.pct_of_peak_sustained_active", "type": "Double", "unit": "", - "value": 97.78385594024182 + "value": 89.90475925765347 }, { "name": "smsp__cycles_active.avg.pct_of_peak_sustained_elapsed", "type": "Double", "unit": "", - "value": 99.96959227253545 + "value": 99.93561734222605 }, { "name": "smsp__sass_thread_inst_executed_op_fp16_pred_on.sum", @@ -7028,7 +7555,7 @@ "name": "smsp__sass_thread_inst_executed_op_fp32_pred_on.sum", "type": "Double", "unit": "", - "value": 6996099072.0 + "value": 4630511616.0 }, { "name": "smsp__sass_thread_inst_executed_op_fp64_pred_on.sum", @@ -7040,7 +7567,7 @@ "name": "smsp__sass_thread_inst_executed_op_integer_pred_on.sum", "type": "Double", "unit": "", - "value": 7683964928.0 + "value": 3635412992.0 }, { "name": "smsp__sass_thread_inst_executed_op_control_pred_on.sum", @@ -7052,31 +7579,31 @@ "name": "smsp__sass_thread_inst_executed_op_memory_pred_on.sum", "type": "Double", "unit": "", - "value": 570425344.0 + "value": 2252341248.0 }, { "name": "smsp__sass_thread_inst_executed_op_misc_pred_on.sum", "type": "Double", "unit": "", - "value": 3850371072.0 + "value": 77594624.0 }, { "name": "smsp__sass_thread_inst_executed_op_conversion_pred_on.sum", "type": "Double", "unit": "", - "value": 25165824.0 + "value": 3145728.0 }, { "name": "smsp__inst_executed.sum", "type": "Double", "unit": "", - "value": 716439552.0 + "value": 367034368.0 }, { "name": "smsp__pipe_fma_cycles_active.avg.pct_of_peak_sustained_active", "type": "Double", "unit": "", - "value": 33.90008102248743 + "value": 9.088813533163375 }, { "name": "smsp__inst_executed_pipe_fp64.avg.pct_of_peak_sustained_active", @@ -7088,13 +7615,13 @@ "name": "smsp__inst_executed_pipe_xu.avg.pct_of_peak_sustained_active", "type": "Double", "unit": "", - "value": 31.4150637623831 + "value": 20.11286161754474 }, { "name": "smsp__inst_executed_pipe_lsu.avg.pct_of_peak_sustained_active", "type": "Double", "unit": "", - "value": 4.356385795174219 + "value": 10.562198569174496 }, { "name": "smsp__inst_executed_pipe_tex.avg.pct_of_peak_sustained_active", @@ -7106,7 +7633,7 @@ "name": "smsp__issue_active.avg.pct_of_peak_sustained_active", "type": "Double", "unit": "", - "value": 83.8455066068534 + "value": 27.500840848582143 }, { "name": "smsp__thread_inst_executed_per_inst_executed.ratio", @@ -7125,30 +7652,47 @@ "time" ], "times": { - "compilation": 13194.803, - "data": 64855.95, - "framework": 2051583.2650000001, - "kernel_overhead": 947772.671, - "profiling_overhead": 53638.459, - "profiling_runs": 985316.185, + "compilation_time": 13984.769, + "data": 60539.798, + "framework": 867998.5009999999, + "kernel_overhead": 352153.604, + "profiling_overhead": 50124.313, + "profiling_runs": 405180.786, "runtimes": [ - 5245.76 + 8502.048 ], - "search_algorithm": 24.132, - "validation": 14.706 + "search_algorithm": 25.173, + "validation": 15.782 }, - "timestamp": "2026-01-27 09:25:1 UTC" + "timestamp": "2026-03-13 09:39:49 UTC" }, { + "compilation_data": { + "constant_memory_size": 0, + "global_size": { + "x": 16, + "y": 32, + "z": 8 + }, + "local_memory_size": 0, + "local_size": { + "x": 16, + "y": 8, + "z": 1 + }, + "max_work_group_size": 1024, + "private_memory_size": 128, + "registers": 32 + }, "configuration": { - "INNER_UNROLL_FACTOR": "1", - "USE_CONSTANT_MEMORY": "0", - "USE_SOA": "0", - "VECTOR_SIZE": "1", - "WORK_GROUP_SIZE_X": "16", - "WORK_GROUP_SIZE_Y": "8", - "WORK_GROUP_SIZE_Z": "1", - "Z_ITERATIONS": "2" + "INNER_UNROLL_FACTOR": 1, + "USE_CONSTANT_MEMORY": 0, + "USE_SOA": 0, + "VECTOR_SIZE": 1, + "WORK_GROUP_SIZE_X": 16, + "WORK_GROUP_SIZE_Y": 8, + "WORK_GROUP_SIZE_Z": 1, + "Z_ITERATIONS": 32 }, "correctness": 1, "invalidity": "correct", @@ -7156,61 +7700,61 @@ { "name": "time", "unit": "", - "value": 5257.984 + "value": 9767.36 }, { "name": "dram__throughput.avg.pct_of_peak_sustained_elapsed", "type": "Double", "unit": "", - "value": 19.784865036509395 + "value": 6.242474160206719 }, { "name": "dram__sectors_read.sum", "type": "Double", "unit": "", - "value": 2097604.0 + "value": 23204.0 }, { "name": "dram__sectors_write.sum", "type": "Double", "unit": "", - "value": 1870056.0 + "value": 2296000.0 }, { "name": "lts__t_sectors.avg.pct_of_peak_sustained_elapsed", "type": "Double", "unit": "", - "value": 2.3211492610706386 + "value": 77.27225884050151 }, { "name": "lts__t_sectors_op_read.sum", "type": "Double", "unit": "", - "value": 2172143.0 + "value": 121369623.0 }, { "name": "lts__t_sectors_op_write.sum", "type": "Double", "unit": "", - "value": 2099302.0 + "value": 138425733.0 }, { "name": "l1tex__t_requests_pipe_lsu_mem_global_op_ld.avg.pct_of_peak_sustained_active", "type": "Double", "unit": "", - "value": 8.098311849560526 + "value": 0.2718645291534466 }, { "name": "l1tex__t_requests_pipe_lsu_mem_global_op_ld.sum", "type": "Double", "unit": "", - "value": 17301504.0 + "value": 1048576.0 }, { "name": "l1tex__data_pipe_lsu_wavefronts_mem_shared.avg.pct_of_peak_sustained_elapsed", "type": "Double", "unit": "", - "value": 0.36803655805849333 + "value": 0.012555228753141336 }, { "name": "l1tex__data_pipe_lsu_wavefronts_mem_shared_op_ld.sum", @@ -7228,25 +7772,25 @@ "name": "l1tex__t_sectors_pipe_lsu_mem_local_op_ld.sum", "type": "Double", "unit": "", - "value": 0.0 + "value": 136314880.0 }, { "name": "l1tex__t_sectors_pipe_lsu_mem_local_op_st.sum", "type": "Double", "unit": "", - "value": 0.0 + "value": 136314880.0 }, { "name": "sm__warps_active.avg.pct_of_peak_sustained_active", "type": "Double", "unit": "", - "value": 97.12596954402618 + "value": 95.26132949647634 }, { "name": "smsp__cycles_active.avg.pct_of_peak_sustained_elapsed", "type": "Double", "unit": "", - "value": 99.96958498640417 + "value": 98.42321466598666 }, { "name": "smsp__sass_thread_inst_executed_op_fp16_pred_on.sum", @@ -7258,7 +7802,7 @@ "name": "smsp__sass_thread_inst_executed_op_fp32_pred_on.sum", "type": "Double", "unit": "", - "value": 6996099072.0 + "value": 4462739456.0 }, { "name": "smsp__sass_thread_inst_executed_op_fp64_pred_on.sum", @@ -7270,7 +7814,7 @@ "name": "smsp__sass_thread_inst_executed_op_integer_pred_on.sum", "type": "Double", "unit": "", - "value": 7683964928.0 + "value": 3461873664.0 }, { "name": "smsp__sass_thread_inst_executed_op_control_pred_on.sum", @@ -7282,31 +7826,31 @@ "name": "smsp__sass_thread_inst_executed_op_memory_pred_on.sum", "type": "Double", "unit": "", - "value": 570425344.0 + "value": 2218786816.0 }, { "name": "smsp__sass_thread_inst_executed_op_misc_pred_on.sum", "type": "Double", "unit": "", - "value": 3850371072.0 + "value": 38797312.0 }, { "name": "smsp__sass_thread_inst_executed_op_conversion_pred_on.sum", "type": "Double", "unit": "", - "value": 25165824.0 + "value": 1572864.0 }, { "name": "smsp__inst_executed.sum", "type": "Double", "unit": "", - "value": 716439552.0 + "value": 352927744.0 }, { "name": "smsp__pipe_fma_cycles_active.avg.pct_of_peak_sustained_active", "type": "Double", "unit": "", - "value": 33.89999755322732 + "value": 7.250918390399337 }, { "name": "smsp__inst_executed_pipe_fp64.avg.pct_of_peak_sustained_active", @@ -7318,13 +7862,13 @@ "name": "smsp__inst_executed_pipe_xu.avg.pct_of_peak_sustained_active", "type": "Double", "unit": "", - "value": 31.415341267975933 + "value": 17.416696236211198 }, { "name": "smsp__inst_executed_pipe_lsu.avg.pct_of_peak_sustained_active", "type": "Double", "unit": "", - "value": 4.356424277395099 + "value": 9.00387067387139 }, { "name": "smsp__inst_executed_pipe_tex.avg.pct_of_peak_sustained_active", @@ -7336,7 +7880,7 @@ "name": "smsp__issue_active.avg.pct_of_peak_sustained_active", "type": "Double", "unit": "", - "value": 83.84626949375024 + "value": 22.899027927000642 }, { "name": "smsp__thread_inst_executed_per_inst_executed.ratio", @@ -7355,30 +7899,47 @@ "time" ], "times": { - "compilation": 13325.61, - "data": 61449.316, - "framework": 2043767.9100000001, - "kernel_overhead": 946687.976, - "profiling_overhead": 51322.06, - "profiling_runs": 984308.558, + "compilation_time": 14240.65, + "data": 59715.488, + "framework": 853463.687, + "kernel_overhead": 341872.934, + "profiling_overhead": 49166.208, + "profiling_runs": 402709.057, "runtimes": [ - 5257.984 + 9767.36 ], - "search_algorithm": 20.763, - "validation": 15.178 + "search_algorithm": 26.958, + "validation": 16.798 }, - "timestamp": "2026-01-27 09:25:2 UTC" + "timestamp": "2026-03-13 09:39:49 UTC" }, { + "compilation_data": { + "constant_memory_size": 0, + "global_size": { + "x": 16, + "y": 32, + "z": 64 + }, + "local_memory_size": 0, + "local_size": { + "x": 16, + "y": 8, + "z": 1 + }, + "max_work_group_size": 1024, + "private_memory_size": 0, + "registers": 23 + }, "configuration": { - "INNER_UNROLL_FACTOR": "1", - "USE_CONSTANT_MEMORY": "0", - "USE_SOA": "0", - "VECTOR_SIZE": "1", - "WORK_GROUP_SIZE_X": "32", - "WORK_GROUP_SIZE_Y": "2", - "WORK_GROUP_SIZE_Z": "1", - "Z_ITERATIONS": "2" + "INNER_UNROLL_FACTOR": 2, + "USE_CONSTANT_MEMORY": 0, + "USE_SOA": 0, + "VECTOR_SIZE": 1, + "WORK_GROUP_SIZE_X": 16, + "WORK_GROUP_SIZE_Y": 8, + "WORK_GROUP_SIZE_Z": 1, + "Z_ITERATIONS": 4 }, "correctness": 1, "invalidity": "correct", @@ -7386,61 +7947,61 @@ { "name": "time", "unit": "", - "value": 5287.072 + "value": 3676.416 }, { "name": "dram__throughput.avg.pct_of_peak_sustained_elapsed", "type": "Double", "unit": "", - "value": 19.505866068028936 + "value": 13.975715828769589 }, { "name": "dram__sectors_read.sum", "type": "Double", "unit": "", - "value": 2097340.0 + "value": 12896.0 }, { "name": "dram__sectors_write.sum", "type": "Double", "unit": "", - "value": 1837040.0 + "value": 1875136.0 }, { "name": "lts__t_sectors.avg.pct_of_peak_sustained_elapsed", "type": "Double", "unit": "", - "value": 2.3294566335991944 + "value": 1.8259583311094798 }, { "name": "lts__t_sectors_op_read.sum", "type": "Double", "unit": "", - "value": 2183753.0 + "value": 72316.0 }, { "name": "lts__t_sectors_op_write.sum", "type": "Double", "unit": "", - "value": 2098982.0 + "value": 2105783.0 }, { "name": "l1tex__t_requests_pipe_lsu_mem_global_op_ld.avg.pct_of_peak_sustained_active", "type": "Double", "unit": "", - "value": 8.098909629511681 + "value": 6.059545600969919 }, { "name": "l1tex__t_requests_pipe_lsu_mem_global_op_ld.sum", "type": "Double", "unit": "", - "value": 17301504.0 + "value": 8388608.0 }, { "name": "l1tex__data_pipe_lsu_wavefronts_mem_shared.avg.pct_of_peak_sustained_elapsed", "type": "Double", "unit": "", - "value": 0.3680434062390291 + "value": 0.2839379241637574 }, { "name": "l1tex__data_pipe_lsu_wavefronts_mem_shared_op_ld.sum", @@ -7470,13 +8031,13 @@ "name": "sm__warps_active.avg.pct_of_peak_sustained_active", "type": "Double", "unit": "", - "value": 97.66711717460329 + "value": 98.14280173270603 }, { "name": "smsp__cycles_active.avg.pct_of_peak_sustained_elapsed", "type": "Double", "unit": "", - "value": 99.96858783279289 + "value": 99.95327815379808 }, { "name": "smsp__sass_thread_inst_executed_op_fp16_pred_on.sum", @@ -7488,7 +8049,7 @@ "name": "smsp__sass_thread_inst_executed_op_fp32_pred_on.sum", "type": "Double", "unit": "", - "value": 6996099072.0 + "value": 5637144576.0 }, { "name": "smsp__sass_thread_inst_executed_op_fp64_pred_on.sum", @@ -7500,43 +8061,43 @@ "name": "smsp__sass_thread_inst_executed_op_integer_pred_on.sum", "type": "Double", "unit": "", - "value": 7683964928.0 + "value": 4945084416.0 }, { "name": "smsp__sass_thread_inst_executed_op_control_pred_on.sum", "type": "Double", "unit": "", - "value": 1073741824.0 + "value": 536870912.0 }, { "name": "smsp__sass_thread_inst_executed_op_memory_pred_on.sum", "type": "Double", "unit": "", - "value": 570425344.0 + "value": 285212672.0 }, { "name": "smsp__sass_thread_inst_executed_op_misc_pred_on.sum", "type": "Double", "unit": "", - "value": 3850371072.0 + "value": 1665138688.0 }, { "name": "smsp__sass_thread_inst_executed_op_conversion_pred_on.sum", "type": "Double", "unit": "", - "value": 25165824.0 + "value": 12582912.0 }, { "name": "smsp__inst_executed.sum", "type": "Double", "unit": "", - "value": 716439552.0 + "value": 484966400.0 }, { "name": "smsp__pipe_fma_cycles_active.avg.pct_of_peak_sustained_active", "type": "Double", "unit": "", - "value": 33.900946827675696 + "value": 41.379535780047014 }, { "name": "smsp__inst_executed_pipe_fp64.avg.pct_of_peak_sustained_active", @@ -7548,13 +8109,13 @@ "name": "smsp__inst_executed_pipe_xu.avg.pct_of_peak_sustained_active", "type": "Double", "unit": "", - "value": 31.416239187314186 + "value": 48.481390457967585 }, { "name": "smsp__inst_executed_pipe_lsu.avg.pct_of_peak_sustained_active", "type": "Double", "unit": "", - "value": 4.356548793553334 + "value": 3.3615026587067365 }, { "name": "smsp__inst_executed_pipe_tex.avg.pct_of_peak_sustained_active", @@ -7566,7 +8127,7 @@ "name": "smsp__issue_active.avg.pct_of_peak_sustained_active", "type": "Double", "unit": "", - "value": 83.84864446961473 + "value": 87.58914272849063 }, { "name": "smsp__thread_inst_executed_per_inst_executed.ratio", @@ -7585,30 +8146,47 @@ "time" ], "times": { - "compilation": 13579.319, - "data": 61943.681, - "framework": 2044157.386, - "kernel_overhead": 946643.329, - "profiling_overhead": 51654.682, - "profiling_runs": 983915.694, + "compilation_time": 13640.987, + "data": 58259.395, + "framework": 1342713.919, + "kernel_overhead": 603484.534, + "profiling_overhead": 48165.951, + "profiling_runs": 632804.039, "runtimes": [ - 5287.072 + 3676.416 ], - "search_algorithm": 35.031, - "validation": 19.409 + "search_algorithm": 26.265, + "validation": 15.467 }, - "timestamp": "2026-01-27 09:25:3 UTC" + "timestamp": "2026-03-13 09:39:50 UTC" }, { + "compilation_data": { + "constant_memory_size": 0, + "global_size": { + "x": 16, + "y": 32, + "z": 32 + }, + "local_memory_size": 0, + "local_size": { + "x": 16, + "y": 8, + "z": 1 + }, + "max_work_group_size": 1024, + "private_memory_size": 0, + "registers": 27 + }, "configuration": { - "INNER_UNROLL_FACTOR": "1", - "USE_CONSTANT_MEMORY": "0", - "USE_SOA": "0", - "VECTOR_SIZE": "1", - "WORK_GROUP_SIZE_X": "32", - "WORK_GROUP_SIZE_Y": "4", - "WORK_GROUP_SIZE_Z": "1", - "Z_ITERATIONS": "2" + "INNER_UNROLL_FACTOR": 2, + "USE_CONSTANT_MEMORY": 0, + "USE_SOA": 0, + "VECTOR_SIZE": 1, + "WORK_GROUP_SIZE_X": 16, + "WORK_GROUP_SIZE_Y": 8, + "WORK_GROUP_SIZE_Z": 1, + "Z_ITERATIONS": 8 }, "correctness": 1, "invalidity": "correct", @@ -7616,61 +8194,61 @@ { "name": "time", "unit": "", - "value": 5290.304 + "value": 4210.272 }, { "name": "dram__throughput.avg.pct_of_peak_sustained_elapsed", "type": "Double", "unit": "", - "value": 19.517159755803476 + "value": 11.862141514912011 }, { "name": "dram__sectors_read.sum", "type": "Double", "unit": "", - "value": 2098640.0 + "value": 10140.0 }, { "name": "dram__sectors_write.sum", "type": "Double", "unit": "", - "value": 1837968.0 + "value": 1872528.0 }, { "name": "lts__t_sectors.avg.pct_of_peak_sustained_elapsed", "type": "Double", "unit": "", - "value": 2.330943160150712 + "value": 1.5191505520101931 }, { "name": "lts__t_sectors_op_read.sum", "type": "Double", "unit": "", - "value": 2185219.0 + "value": 81002.0 }, { "name": "lts__t_sectors_op_write.sum", "type": "Double", "unit": "", - "value": 2100213.0 + "value": 2101653.0 }, { "name": "l1tex__t_requests_pipe_lsu_mem_global_op_ld.avg.pct_of_peak_sustained_active", "type": "Double", "unit": "", - "value": 8.09924838159135 + "value": 2.5397727346698145 }, { "name": "l1tex__t_requests_pipe_lsu_mem_global_op_ld.sum", "type": "Double", "unit": "", - "value": 17301504.0 + "value": 4194304.0 }, { "name": "l1tex__data_pipe_lsu_wavefronts_mem_shared.avg.pct_of_peak_sustained_elapsed", "type": "Double", "unit": "", - "value": 0.3680463963696152 + "value": 0.11900704081409803 }, { "name": "l1tex__data_pipe_lsu_wavefronts_mem_shared_op_ld.sum", @@ -7700,13 +8278,13 @@ "name": "sm__warps_active.avg.pct_of_peak_sustained_active", "type": "Double", "unit": "", - "value": 97.58195526341959 + "value": 98.8669592396122 }, { "name": "smsp__cycles_active.avg.pct_of_peak_sustained_elapsed", "type": "Double", "unit": "", - "value": 99.9695387778197 + "value": 99.95594300731703 }, { "name": "smsp__sass_thread_inst_executed_op_fp16_pred_on.sum", @@ -7718,7 +8296,7 @@ "name": "smsp__sass_thread_inst_executed_op_fp32_pred_on.sum", "type": "Double", "unit": "", - "value": 6996099072.0 + "value": 4966055936.0 }, { "name": "smsp__sass_thread_inst_executed_op_fp64_pred_on.sum", @@ -7730,43 +8308,43 @@ "name": "smsp__sass_thread_inst_executed_op_integer_pred_on.sum", "type": "Double", "unit": "", - "value": 7683964928.0 + "value": 5463080960.0 }, { "name": "smsp__sass_thread_inst_executed_op_control_pred_on.sum", "type": "Double", "unit": "", - "value": 1073741824.0 + "value": 536870912.0 }, { "name": "smsp__sass_thread_inst_executed_op_memory_pred_on.sum", "type": "Double", "unit": "", - "value": 570425344.0 + "value": 150994944.0 }, { "name": "smsp__sass_thread_inst_executed_op_misc_pred_on.sum", "type": "Double", "unit": "", - "value": 3850371072.0 + "value": 1369440256.0 }, { "name": "smsp__sass_thread_inst_executed_op_conversion_pred_on.sum", "type": "Double", "unit": "", - "value": 25165824.0 + "value": 6291456.0 }, { "name": "smsp__inst_executed.sum", "type": "Double", "unit": "", - "value": 716439552.0 + "value": 596246528.0 }, { "name": "smsp__pipe_fma_cycles_active.avg.pct_of_peak_sustained_active", "type": "Double", "unit": "", - "value": 33.90112260578342 + "value": 40.966309128121544 }, { "name": "smsp__inst_executed_pipe_fp64.avg.pct_of_peak_sustained_active", @@ -7778,13 +8356,13 @@ "name": "smsp__inst_executed_pipe_xu.avg.pct_of_peak_sustained_active", "type": "Double", "unit": "", - "value": 31.41619558067693 + "value": 40.638974241119975 }, { "name": "smsp__inst_executed_pipe_lsu.avg.pct_of_peak_sustained_active", "type": "Double", "unit": "", - "value": 4.356542746539184 + "value": 1.4882436855878898 }, { "name": "smsp__inst_executed_pipe_tex.avg.pct_of_peak_sustained_active", @@ -7796,7 +8374,7 @@ "name": "smsp__issue_active.avg.pct_of_peak_sustained_active", "type": "Double", "unit": "", - "value": 83.84854961963998 + "value": 90.26750473247913 }, { "name": "smsp__thread_inst_executed_per_inst_executed.ratio", @@ -7815,30 +8393,47 @@ "time" ], "times": { - "compilation": 12812.341, - "data": 61448.954, - "framework": 2050034.003, - "kernel_overhead": 949746.544, - "profiling_overhead": 51463.216, - "profiling_runs": 987375.289, + "compilation_time": 13230.829, + "data": 60605.908, + "framework": 1926204.6260000002, + "kernel_overhead": 891593.946, + "profiling_overhead": 50489.035, + "profiling_runs": 923515.737, "runtimes": [ - 5290.304 + 4210.272 ], - "search_algorithm": 21.259, - "validation": 15.557 + "search_algorithm": 27.301, + "validation": 18.108 }, - "timestamp": "2026-01-27 09:25:4 UTC" + "timestamp": "2026-03-13 09:39:51 UTC" }, { + "compilation_data": { + "constant_memory_size": 0, + "global_size": { + "x": 16, + "y": 32, + "z": 16 + }, + "local_memory_size": 0, + "local_size": { + "x": 16, + "y": 8, + "z": 1 + }, + "max_work_group_size": 1024, + "private_memory_size": 64, + "registers": 32 + }, "configuration": { - "INNER_UNROLL_FACTOR": "1", - "USE_CONSTANT_MEMORY": "0", - "USE_SOA": "0", - "VECTOR_SIZE": "1", - "WORK_GROUP_SIZE_X": "32", - "WORK_GROUP_SIZE_Y": "8", - "WORK_GROUP_SIZE_Z": "1", - "Z_ITERATIONS": "2" + "INNER_UNROLL_FACTOR": 2, + "USE_CONSTANT_MEMORY": 0, + "USE_SOA": 0, + "VECTOR_SIZE": 1, + "WORK_GROUP_SIZE_X": 16, + "WORK_GROUP_SIZE_Y": 8, + "WORK_GROUP_SIZE_Z": 1, + "Z_ITERATIONS": 16 }, "correctness": 1, "invalidity": "correct", @@ -7846,61 +8441,61 @@ { "name": "time", "unit": "", - "value": 5349.504 + "value": 8303.808 }, { "name": "dram__throughput.avg.pct_of_peak_sustained_elapsed", "type": "Double", "unit": "", - "value": 19.42732514129996 + "value": 6.40172827953853 }, { "name": "dram__sectors_read.sum", "type": "Double", "unit": "", - "value": 2104576.0 + "value": 4868.0 }, { "name": "dram__sectors_write.sum", "type": "Double", "unit": "", - "value": 1837536.0 + "value": 2017988.0 }, { "name": "lts__t_sectors.avg.pct_of_peak_sustained_elapsed", "type": "Double", "unit": "", - "value": 2.3209273226654155 + "value": 57.23745756213624 }, { "name": "lts__t_sectors_op_read.sum", "type": "Double", "unit": "", - "value": 2190312.0 + "value": 25652550.0 }, { "name": "lts__t_sectors_op_write.sum", "type": "Double", "unit": "", - "value": 2101158.0 + "value": 138423121.0 }, { "name": "l1tex__t_requests_pipe_lsu_mem_global_op_ld.avg.pct_of_peak_sustained_active", "type": "Double", "unit": "", - "value": 8.098158220455524 + "value": 0.6312728550604355 }, { "name": "l1tex__t_requests_pipe_lsu_mem_global_op_ld.sum", "type": "Double", "unit": "", - "value": 17301504.0 + "value": 2097152.0 }, { "name": "l1tex__data_pipe_lsu_wavefronts_mem_shared.avg.pct_of_peak_sustained_elapsed", "type": "Double", "unit": "", - "value": 0.3679858040372277 + "value": 0.02945935043570734 }, { "name": "l1tex__data_pipe_lsu_wavefronts_mem_shared_op_ld.sum", @@ -7918,25 +8513,25 @@ "name": "l1tex__t_sectors_pipe_lsu_mem_local_op_ld.sum", "type": "Double", "unit": "", - "value": 0.0 + "value": 136314880.0 }, { "name": "l1tex__t_sectors_pipe_lsu_mem_local_op_st.sum", "type": "Double", "unit": "", - "value": 0.0 + "value": 136314880.0 }, { "name": "sm__warps_active.avg.pct_of_peak_sustained_active", "type": "Double", "unit": "", - "value": 96.97785880981384 + "value": 91.7025527701918 }, { "name": "smsp__cycles_active.avg.pct_of_peak_sustained_elapsed", "type": "Double", "unit": "", - "value": 99.96542014707303 + "value": 99.48985471143669 }, { "name": "smsp__sass_thread_inst_executed_op_fp16_pred_on.sum", @@ -7948,7 +8543,7 @@ "name": "smsp__sass_thread_inst_executed_op_fp32_pred_on.sum", "type": "Double", "unit": "", - "value": 6996099072.0 + "value": 4630511616.0 }, { "name": "smsp__sass_thread_inst_executed_op_fp64_pred_on.sum", @@ -7960,43 +8555,43 @@ "name": "smsp__sass_thread_inst_executed_op_integer_pred_on.sum", "type": "Double", "unit": "", - "value": 7683964928.0 + "value": 1958739968.0 }, { "name": "smsp__sass_thread_inst_executed_op_control_pred_on.sum", "type": "Double", "unit": "", - "value": 1073741824.0 + "value": 536870912.0 }, { "name": "smsp__sass_thread_inst_executed_op_memory_pred_on.sum", "type": "Double", "unit": "", - "value": 570425344.0 + "value": 1178599424.0 }, { "name": "smsp__sass_thread_inst_executed_op_misc_pred_on.sum", "type": "Double", "unit": "", - "value": 3850371072.0 + "value": 143654912.0 }, { "name": "smsp__sass_thread_inst_executed_op_conversion_pred_on.sum", "type": "Double", "unit": "", - "value": 25165824.0 + "value": 3145728.0 }, { "name": "smsp__inst_executed.sum", "type": "Double", "unit": "", - "value": 716439552.0 + "value": 266371072.0 }, { "name": "smsp__pipe_fma_cycles_active.avg.pct_of_peak_sustained_active", "type": "Double", "unit": "", - "value": 33.896491937166225 + "value": 8.976818138966234 }, { "name": "smsp__inst_executed_pipe_fp64.avg.pct_of_peak_sustained_active", @@ -8008,13 +8603,13 @@ "name": "smsp__inst_executed_pipe_xu.avg.pct_of_peak_sustained_active", "type": "Double", "unit": "", - "value": 31.412317611074293 + "value": 20.2140375241675 }, { "name": "smsp__inst_executed_pipe_lsu.avg.pct_of_peak_sustained_active", "type": "Double", "unit": "", - "value": 4.356004981223193 + "value": 5.561821359799017 }, { "name": "smsp__inst_executed_pipe_tex.avg.pct_of_peak_sustained_active", @@ -8026,7 +8621,7 @@ "name": "smsp__issue_active.avg.pct_of_peak_sustained_active", "type": "Double", "unit": "", - "value": 83.83819900824565 + "value": 20.05891015741156 }, { "name": "smsp__thread_inst_executed_per_inst_executed.ratio", @@ -8045,30 +8640,47 @@ "time" ], "times": { - "compilation": 13965.338, - "data": 64447.491, - "framework": 2051829.1469999999, - "kernel_overhead": 947604.777, - "profiling_overhead": 54395.159, - "profiling_runs": 985381.72, + "compilation_time": 13511.5, + "data": 60082.512, + "framework": 548995.833, + "kernel_overhead": 193367.178, + "profiling_overhead": 49544.384, + "profiling_runs": 246001.759, "runtimes": [ - 5349.504 + 8303.808 ], - "search_algorithm": 21.735, - "validation": 11.549 + "search_algorithm": 26.803, + "validation": 20.728 }, - "timestamp": "2026-01-27 09:25:5 UTC" + "timestamp": "2026-03-13 09:39:52 UTC" }, { + "compilation_data": { + "constant_memory_size": 0, + "global_size": { + "x": 16, + "y": 32, + "z": 8 + }, + "local_memory_size": 0, + "local_size": { + "x": 16, + "y": 8, + "z": 1 + }, + "max_work_group_size": 1024, + "private_memory_size": 128, + "registers": 32 + }, "configuration": { - "INNER_UNROLL_FACTOR": "1", - "USE_CONSTANT_MEMORY": "0", - "USE_SOA": "0", - "VECTOR_SIZE": "1", - "WORK_GROUP_SIZE_X": "16", - "WORK_GROUP_SIZE_Y": "4", - "WORK_GROUP_SIZE_Z": "1", - "Z_ITERATIONS": "4" + "INNER_UNROLL_FACTOR": 2, + "USE_CONSTANT_MEMORY": 0, + "USE_SOA": 0, + "VECTOR_SIZE": 1, + "WORK_GROUP_SIZE_X": 16, + "WORK_GROUP_SIZE_Y": 8, + "WORK_GROUP_SIZE_Z": 1, + "Z_ITERATIONS": 32 }, "correctness": 1, "invalidity": "correct", @@ -8076,61 +8688,61 @@ { "name": "time", "unit": "", - "value": 6289.568 + "value": 9562.176 }, { "name": "dram__throughput.avg.pct_of_peak_sustained_elapsed", "type": "Double", "unit": "", - "value": 16.589757293428203 + "value": 6.225000697452207 }, { "name": "dram__sectors_read.sum", "type": "Double", "unit": "", - "value": 2120756.0 + "value": 6092.0 }, { "name": "dram__sectors_write.sum", "type": "Double", "unit": "", - "value": 1872380.0 + "value": 2278796.0 }, { "name": "lts__t_sectors.avg.pct_of_peak_sustained_elapsed", "type": "Double", "unit": "", - "value": 2.0182744976900633 + "value": 81.73561573473982 }, { "name": "lts__t_sectors_op_read.sum", "type": "Double", "unit": "", - "value": 2212600.0 + "value": 133187151.0 }, { "name": "lts__t_sectors_op_write.sum", "type": "Double", "unit": "", - "value": 2107493.0 + "value": 138415966.0 }, { "name": "l1tex__t_requests_pipe_lsu_mem_global_op_ld.avg.pct_of_peak_sustained_active", "type": "Double", "unit": "", - "value": 3.613005035480322 + "value": 0.2739006620849816 }, { "name": "l1tex__t_requests_pipe_lsu_mem_global_op_ld.sum", "type": "Double", "unit": "", - "value": 8912896.0 + "value": 1048576.0 }, { "name": "l1tex__data_pipe_lsu_wavefronts_mem_shared.avg.pct_of_peak_sustained_elapsed", "type": "Double", "unit": "", - "value": 0.15935632907756314 + "value": 0.012789094100006218 }, { "name": "l1tex__data_pipe_lsu_wavefronts_mem_shared_op_ld.sum", @@ -8148,25 +8760,25 @@ "name": "l1tex__t_sectors_pipe_lsu_mem_local_op_ld.sum", "type": "Double", "unit": "", - "value": 0.0 + "value": 136314880.0 }, { "name": "l1tex__t_sectors_pipe_lsu_mem_local_op_st.sum", "type": "Double", "unit": "", - "value": 0.0 + "value": 136314880.0 }, { "name": "sm__warps_active.avg.pct_of_peak_sustained_active", "type": "Double", "unit": "", - "value": 98.76699859556523 + "value": 98.78420895283412 }, { "name": "smsp__cycles_active.avg.pct_of_peak_sustained_elapsed", "type": "Double", "unit": "", - "value": 99.96838717110225 + "value": 99.56574674175764 }, { "name": "smsp__sass_thread_inst_executed_op_fp16_pred_on.sum", @@ -8178,7 +8790,7 @@ "name": "smsp__sass_thread_inst_executed_op_fp32_pred_on.sum", "type": "Double", "unit": "", - "value": 5653921792.0 + "value": 4462739456.0 }, { "name": "smsp__sass_thread_inst_executed_op_fp64_pred_on.sum", @@ -8190,43 +8802,43 @@ "name": "smsp__sass_thread_inst_executed_op_integer_pred_on.sum", "type": "Double", "unit": "", - "value": 9781116928.0 + "value": 1817706496.0 }, { "name": "smsp__sass_thread_inst_executed_op_control_pred_on.sum", "type": "Double", "unit": "", - "value": 1073741824.0 + "value": 536870912.0 }, { "name": "smsp__sass_thread_inst_executed_op_memory_pred_on.sum", "type": "Double", "unit": "", - "value": 301989888.0 + "value": 1145044992.0 }, { "name": "smsp__sass_thread_inst_executed_op_misc_pred_on.sum", "type": "Double", "unit": "", - "value": 1665138688.0 + "value": 72351744.0 }, { "name": "smsp__sass_thread_inst_executed_op_conversion_pred_on.sum", "type": "Double", "unit": "", - "value": 12582912.0 + "value": 1572864.0 }, { "name": "smsp__inst_executed.sum", "type": "Double", "unit": "", - "value": 788135936.0 + "value": 252264448.0 }, { "name": "smsp__pipe_fma_cycles_active.avg.pct_of_peak_sustained_active", "type": "Double", "unit": "", - "value": 28.387698864082584 + "value": 7.233762909093501 }, { "name": "smsp__inst_executed_pipe_fp64.avg.pct_of_peak_sustained_active", @@ -8238,13 +8850,13 @@ "name": "smsp__inst_executed_pipe_xu.avg.pct_of_peak_sustained_active", "type": "Double", "unit": "", - "value": 27.205413896850246 + "value": 17.537533789772603 }, { "name": "smsp__inst_executed_pipe_lsu.avg.pct_of_peak_sustained_active", "type": "Double", "unit": "", - "value": 1.9925840256482115 + "value": 4.681956347440512 }, { "name": "smsp__inst_executed_pipe_tex.avg.pct_of_peak_sustained_active", @@ -8256,7 +8868,7 @@ "name": "smsp__issue_active.avg.pct_of_peak_sustained_active", "type": "Double", "unit": "", - "value": 79.8764338250357 + "value": 16.481353021762146 }, { "name": "smsp__thread_inst_executed_per_inst_executed.ratio", @@ -8275,30 +8887,47 @@ "time" ], "times": { - "compilation": 12816.291, - "data": 59625.575, - "framework": 2625395.2309999997, - "kernel_overhead": 1237036.192, - "profiling_overhead": 49599.192, - "profiling_runs": 1279134.272, + "compilation_time": 13073.774, + "data": 60135.174, + "framework": 529899.45, + "kernel_overhead": 180467.546, + "profiling_overhead": 49902.893, + "profiling_runs": 239393.837, "runtimes": [ - 6289.568 + 9562.176 ], - "search_algorithm": 23.22, - "validation": 16.499 + "search_algorithm": 23.886, + "validation": 15.483 }, - "timestamp": "2026-01-27 09:25:6 UTC" + "timestamp": "2026-03-13 09:39:52 UTC" }, { + "compilation_data": { + "constant_memory_size": 0, + "global_size": { + "x": 16, + "y": 32, + "z": 32 + }, + "local_memory_size": 0, + "local_size": { + "x": 16, + "y": 8, + "z": 1 + }, + "max_work_group_size": 1024, + "private_memory_size": 0, + "registers": 31 + }, "configuration": { - "INNER_UNROLL_FACTOR": "1", - "USE_CONSTANT_MEMORY": "0", - "USE_SOA": "0", - "VECTOR_SIZE": "1", - "WORK_GROUP_SIZE_X": "16", - "WORK_GROUP_SIZE_Y": "8", - "WORK_GROUP_SIZE_Z": "1", - "Z_ITERATIONS": "4" + "INNER_UNROLL_FACTOR": 4, + "USE_CONSTANT_MEMORY": 0, + "USE_SOA": 0, + "VECTOR_SIZE": 1, + "WORK_GROUP_SIZE_X": 16, + "WORK_GROUP_SIZE_Y": 8, + "WORK_GROUP_SIZE_Z": 1, + "Z_ITERATIONS": 8 }, "correctness": 1, "invalidity": "correct", @@ -8306,61 +8935,61 @@ { "name": "time", "unit": "", - "value": 6101.152 + "value": 2606.656 }, { "name": "dram__throughput.avg.pct_of_peak_sustained_elapsed", "type": "Double", "unit": "", - "value": 17.08864585623214 + "value": 18.646266711484312 }, { "name": "dram__sectors_read.sum", "type": "Double", "unit": "", - "value": 2108616.0 + "value": 9032.0 }, { "name": "dram__sectors_write.sum", "type": "Double", "unit": "", - "value": 1871480.0 + "value": 1872612.0 }, { "name": "lts__t_sectors.avg.pct_of_peak_sustained_elapsed", "type": "Double", "unit": "", - "value": 2.02559165757116 + "value": 2.429362155272285 }, { "name": "lts__t_sectors_op_read.sum", "type": "Double", "unit": "", - "value": 2197426.0 + "value": 53484.0 }, { "name": "lts__t_sectors_op_write.sum", "type": "Double", "unit": "", - "value": 2102827.0 + "value": 2105068.0 }, { "name": "l1tex__t_requests_pipe_lsu_mem_global_op_ld.avg.pct_of_peak_sustained_active", "type": "Double", "unit": "", - "value": 3.613077505219231 + "value": 4.066855499621977 }, { "name": "l1tex__t_requests_pipe_lsu_mem_global_op_ld.sum", "type": "Double", "unit": "", - "value": 8912896.0 + "value": 4194304.0 }, { "name": "l1tex__data_pipe_lsu_wavefronts_mem_shared.avg.pct_of_peak_sustained_elapsed", "type": "Double", "unit": "", - "value": 0.15936527281170798 + "value": 0.19055846983781827 }, { "name": "l1tex__data_pipe_lsu_wavefronts_mem_shared_op_ld.sum", @@ -8390,13 +9019,13 @@ "name": "sm__warps_active.avg.pct_of_peak_sustained_active", "type": "Double", "unit": "", - "value": 98.6631450823399 + "value": 98.60720811885393 }, { "name": "smsp__cycles_active.avg.pct_of_peak_sustained_elapsed", "type": "Double", "unit": "", - "value": 99.97348056177815 + "value": 99.94983820247597 }, { "name": "smsp__sass_thread_inst_executed_op_fp16_pred_on.sum", @@ -8408,7 +9037,7 @@ "name": "smsp__sass_thread_inst_executed_op_fp32_pred_on.sum", "type": "Double", "unit": "", - "value": 5653921792.0 + "value": 4966055936.0 }, { "name": "smsp__sass_thread_inst_executed_op_fp64_pred_on.sum", @@ -8420,43 +9049,43 @@ "name": "smsp__sass_thread_inst_executed_op_integer_pred_on.sum", "type": "Double", "unit": "", - "value": 9781116928.0 + "value": 2776629248.0 }, { "name": "smsp__sass_thread_inst_executed_op_control_pred_on.sum", "type": "Double", "unit": "", - "value": 1073741824.0 + "value": 268435456.0 }, { "name": "smsp__sass_thread_inst_executed_op_memory_pred_on.sum", "type": "Double", "unit": "", - "value": 301989888.0 + "value": 150994944.0 }, { "name": "smsp__sass_thread_inst_executed_op_misc_pred_on.sum", "type": "Double", "unit": "", - "value": 1665138688.0 + "value": 1642070016.0 }, { "name": "smsp__sass_thread_inst_executed_op_conversion_pred_on.sum", "type": "Double", "unit": "", - "value": 12582912.0 + "value": 6291456.0 }, { "name": "smsp__inst_executed.sum", "type": "Double", "unit": "", - "value": 788135936.0 + "value": 378208256.0 }, { "name": "smsp__pipe_fma_cycles_active.avg.pct_of_peak_sustained_active", "type": "Double", "unit": "", - "value": 28.387779151626592 + "value": 45.248621371022594 }, { "name": "smsp__inst_executed_pipe_fp64.avg.pct_of_peak_sustained_active", @@ -8468,13 +9097,13 @@ "name": "smsp__inst_executed_pipe_xu.avg.pct_of_peak_sustained_active", "type": "Double", "unit": "", - "value": 27.205554653457398 + "value": 65.07660129762041 }, { "name": "smsp__inst_executed_pipe_lsu.avg.pct_of_peak_sustained_active", "type": "Double", "unit": "", - "value": 1.9925943349700241 + "value": 2.3831763170515283 }, { "name": "smsp__inst_executed_pipe_tex.avg.pct_of_peak_sustained_active", @@ -8486,7 +9115,7 @@ "name": "smsp__issue_active.avg.pct_of_peak_sustained_active", "type": "Double", "unit": "", - "value": 79.87686341048293 + "value": 91.68959600722548 }, { "name": "smsp__thread_inst_executed_per_inst_executed.ratio", @@ -8505,30 +9134,47 @@ "time" ], "times": { - "compilation": 13416.071, - "data": 62867.898, - "framework": 2631977.717, - "kernel_overhead": 1237222.732, - "profiling_overhead": 52661.391, - "profiling_runs": 1279225.696, + "compilation_time": 13671.837, + "data": 60386.798, + "framework": 1068408.703, + "kernel_overhead": 466696.578, + "profiling_overhead": 49962.565, + "profiling_runs": 491362.762, "runtimes": [ - 6101.152 + 2606.656 ], - "search_algorithm": 24.035, - "validation": 15.018 + "search_algorithm": 83.174, + "validation": 15.4 }, - "timestamp": "2026-01-27 09:25:8 UTC" + "timestamp": "2026-03-13 09:39:52 UTC" }, { + "compilation_data": { + "constant_memory_size": 0, + "global_size": { + "x": 16, + "y": 32, + "z": 16 + }, + "local_memory_size": 0, + "local_size": { + "x": 16, + "y": 8, + "z": 1 + }, + "max_work_group_size": 1024, + "private_memory_size": 0, + "registers": 38 + }, "configuration": { - "INNER_UNROLL_FACTOR": "1", - "USE_CONSTANT_MEMORY": "0", - "USE_SOA": "0", - "VECTOR_SIZE": "1", - "WORK_GROUP_SIZE_X": "32", - "WORK_GROUP_SIZE_Y": "2", - "WORK_GROUP_SIZE_Z": "1", - "Z_ITERATIONS": "4" + "INNER_UNROLL_FACTOR": 4, + "USE_CONSTANT_MEMORY": 0, + "USE_SOA": 0, + "VECTOR_SIZE": 1, + "WORK_GROUP_SIZE_X": 16, + "WORK_GROUP_SIZE_Y": 8, + "WORK_GROUP_SIZE_Z": 1, + "Z_ITERATIONS": 16 }, "correctness": 1, "invalidity": "correct", @@ -8536,61 +9182,61 @@ { "name": "time", "unit": "", - "value": 6066.816 + "value": 3403.872 }, { "name": "dram__throughput.avg.pct_of_peak_sustained_elapsed", "type": "Double", "unit": "", - "value": 17.021339811835983 + "value": 14.967982685179496 }, { "name": "dram__sectors_read.sum", "type": "Double", "unit": "", - "value": 2104252.0 + "value": 524.0 }, { "name": "dram__sectors_write.sum", "type": "Double", "unit": "", - "value": 1837204.0 + "value": 1870584.0 }, { "name": "lts__t_sectors.avg.pct_of_peak_sustained_elapsed", "type": "Double", "unit": "", - "value": 2.026055794173669 + "value": 1.9047070231718157 }, { "name": "lts__t_sectors_op_read.sum", "type": "Double", "unit": "", - "value": 2204246.0 + "value": 54076.0 }, { "name": "lts__t_sectors_op_write.sum", "type": "Double", "unit": "", - "value": 2100945.0 + "value": 2099384.0 }, { "name": "l1tex__t_requests_pipe_lsu_mem_global_op_ld.avg.pct_of_peak_sustained_active", "type": "Double", "unit": "", - "value": 3.6130771390551994 + "value": 1.5933653701416552 }, { "name": "l1tex__t_requests_pipe_lsu_mem_global_op_ld.sum", "type": "Double", "unit": "", - "value": 8912896.0 + "value": 2097152.0 }, { "name": "l1tex__data_pipe_lsu_wavefronts_mem_shared.avg.pct_of_peak_sustained_elapsed", "type": "Double", "unit": "", - "value": 0.1593656345089629 + "value": 0.07466498281575212 }, { "name": "l1tex__data_pipe_lsu_wavefronts_mem_shared_op_ld.sum", @@ -8620,13 +9266,13 @@ "name": "sm__warps_active.avg.pct_of_peak_sustained_active", "type": "Double", "unit": "", - "value": 98.97606495883939 + "value": 98.59393961313472 }, { "name": "smsp__cycles_active.avg.pct_of_peak_sustained_elapsed", "type": "Double", "unit": "", - "value": 99.97376724268435 + "value": 99.95883320378765 }, { "name": "smsp__sass_thread_inst_executed_op_fp16_pred_on.sum", @@ -8638,7 +9284,7 @@ "name": "smsp__sass_thread_inst_executed_op_fp32_pred_on.sum", "type": "Double", "unit": "", - "value": 5653921792.0 + "value": 4630511616.0 }, { "name": "smsp__sass_thread_inst_executed_op_fp64_pred_on.sum", @@ -8650,43 +9296,43 @@ "name": "smsp__sass_thread_inst_executed_op_integer_pred_on.sum", "type": "Double", "unit": "", - "value": 9781116928.0 + "value": 3234856960.0 }, { "name": "smsp__sass_thread_inst_executed_op_control_pred_on.sum", "type": "Double", "unit": "", - "value": 1073741824.0 + "value": 268435456.0 }, { "name": "smsp__sass_thread_inst_executed_op_memory_pred_on.sum", "type": "Double", "unit": "", - "value": 301989888.0 + "value": 83886080.0 }, { "name": "smsp__sass_thread_inst_executed_op_misc_pred_on.sum", "type": "Double", "unit": "", - "value": 1665138688.0 + "value": 1294991360.0 }, { "name": "smsp__sass_thread_inst_executed_op_conversion_pred_on.sum", "type": "Double", "unit": "", - "value": 12582912.0 + "value": 3145728.0 }, { "name": "smsp__inst_executed.sum", "type": "Double", "unit": "", - "value": 788135936.0 + "value": 500957184.0 }, { "name": "smsp__pipe_fma_cycles_active.avg.pct_of_peak_sustained_active", "type": "Double", "unit": "", - "value": 28.388014737507046 + "value": 45.37124393274073 }, { "name": "smsp__inst_executed_pipe_fp64.avg.pct_of_peak_sustained_active", @@ -8698,13 +9344,13 @@ "name": "smsp__inst_executed_pipe_xu.avg.pct_of_peak_sustained_active", "type": "Double", "unit": "", - "value": 27.205538385724147 + "value": 50.99228682635527 }, { "name": "smsp__inst_executed_pipe_lsu.avg.pct_of_peak_sustained_active", "type": "Double", "unit": "", - "value": 1.9925931434856554 + "value": 1.0332909684051483 }, { "name": "smsp__inst_executed_pipe_tex.avg.pct_of_peak_sustained_active", @@ -8716,7 +9362,7 @@ "name": "smsp__issue_active.avg.pct_of_peak_sustained_active", "type": "Double", "unit": "", - "value": 79.87679750622135 + "value": 95.16313122613202 }, { "name": "smsp__thread_inst_executed_per_inst_executed.ratio", @@ -8735,30 +9381,47 @@ "time" ], "times": { - "compilation": 13501.712, - "data": 62800.249, - "framework": 2636304.635, - "kernel_overhead": 1239649.319, - "profiling_overhead": 52746.612, - "profiling_runs": 1281108.455, + "compilation_time": 13395.907, + "data": 58106.009, + "framework": 1816746.463, + "kernel_overhead": 841553.396, + "profiling_overhead": 47703.216, + "profiling_runs": 869383.842, "runtimes": [ - 6066.816 + 3403.872 ], - "search_algorithm": 23.734, - "validation": 13.416 + "search_algorithm": 25.326, + "validation": 13.25 }, - "timestamp": "2026-01-27 09:25:9 UTC" + "timestamp": "2026-03-13 09:39:53 UTC" }, { + "compilation_data": { + "constant_memory_size": 0, + "global_size": { + "x": 16, + "y": 32, + "z": 8 + }, + "local_memory_size": 0, + "local_size": { + "x": 16, + "y": 8, + "z": 1 + }, + "max_work_group_size": 1024, + "private_memory_size": 0, + "registers": 56 + }, "configuration": { - "INNER_UNROLL_FACTOR": "1", - "USE_CONSTANT_MEMORY": "0", - "USE_SOA": "0", - "VECTOR_SIZE": "1", - "WORK_GROUP_SIZE_X": "32", - "WORK_GROUP_SIZE_Y": "4", - "WORK_GROUP_SIZE_Z": "1", - "Z_ITERATIONS": "4" + "INNER_UNROLL_FACTOR": 4, + "USE_CONSTANT_MEMORY": 0, + "USE_SOA": 0, + "VECTOR_SIZE": 1, + "WORK_GROUP_SIZE_X": 16, + "WORK_GROUP_SIZE_Y": 8, + "WORK_GROUP_SIZE_Z": 1, + "Z_ITERATIONS": 32 }, "correctness": 1, "invalidity": "correct", @@ -8766,61 +9429,61 @@ { "name": "time", "unit": "", - "value": 6071.776 + "value": 5476.512 }, { "name": "dram__throughput.avg.pct_of_peak_sustained_elapsed", "type": "Double", "unit": "", - "value": 17.05804454898232 + "value": 8.896491312566978 }, { "name": "dram__sectors_read.sum", "type": "Double", "unit": "", - "value": 2110136.0 + "value": 19980.0 }, { "name": "dram__sectors_write.sum", "type": "Double", "unit": "", - "value": 1839732.0 + "value": 1871508.0 }, { "name": "lts__t_sectors.avg.pct_of_peak_sustained_elapsed", "type": "Double", "unit": "", - "value": 2.0310810750738773 + "value": 1.1669972281057701 }, { "name": "lts__t_sectors_op_read.sum", "type": "Double", "unit": "", - "value": 2210176.0 + "value": 111029.0 }, { "name": "lts__t_sectors_op_write.sum", "type": "Double", "unit": "", - "value": 2102700.0 + "value": 2103679.0 }, { "name": "l1tex__t_requests_pipe_lsu_mem_global_op_ld.avg.pct_of_peak_sustained_active", "type": "Double", "unit": "", - "value": 3.6131660458607695 + "value": 0.4767256536101042 }, { "name": "l1tex__t_requests_pipe_lsu_mem_global_op_ld.sum", "type": "Double", "unit": "", - "value": 8912896.0 + "value": 1048576.0 }, { "name": "l1tex__data_pipe_lsu_wavefronts_mem_shared.avg.pct_of_peak_sustained_elapsed", "type": "Double", "unit": "", - "value": 0.15937069327548536 + "value": 0.022342026165770853 }, { "name": "l1tex__data_pipe_lsu_wavefronts_mem_shared_op_ld.sum", @@ -8850,13 +9513,13 @@ "name": "sm__warps_active.avg.pct_of_peak_sustained_active", "type": "Double", "unit": "", - "value": 98.74242570639092 + "value": 73.86720951220232 }, { "name": "smsp__cycles_active.avg.pct_of_peak_sustained_elapsed", "type": "Double", "unit": "", - "value": 99.97622850998778 + "value": 99.97500601732874 }, { "name": "smsp__sass_thread_inst_executed_op_fp16_pred_on.sum", @@ -8868,7 +9531,7 @@ "name": "smsp__sass_thread_inst_executed_op_fp32_pred_on.sum", "type": "Double", "unit": "", - "value": 5653921792.0 + "value": 4462739456.0 }, { "name": "smsp__sass_thread_inst_executed_op_fp64_pred_on.sum", @@ -8880,43 +9543,43 @@ "name": "smsp__sass_thread_inst_executed_op_integer_pred_on.sum", "type": "Double", "unit": "", - "value": 9781116928.0 + "value": 4706533376.0 }, { "name": "smsp__sass_thread_inst_executed_op_control_pred_on.sum", "type": "Double", "unit": "", - "value": 1073741824.0 + "value": 268435456.0 }, { "name": "smsp__sass_thread_inst_executed_op_memory_pred_on.sum", "type": "Double", "unit": "", - "value": 301989888.0 + "value": 50331648.0 }, { "name": "smsp__sass_thread_inst_executed_op_misc_pred_on.sum", "type": "Double", "unit": "", - "value": 1665138688.0 + "value": 1121452032.0 }, { "name": "smsp__sass_thread_inst_executed_op_conversion_pred_on.sum", "type": "Double", "unit": "", - "value": 12582912.0 + "value": 1572864.0 }, { "name": "smsp__inst_executed.sum", "type": "Double", "unit": "", - "value": 788135936.0 + "value": 852869120.0 }, { "name": "smsp__pipe_fma_cycles_active.avg.pct_of_peak_sustained_active", "type": "Double", "unit": "", - "value": 28.38805004693271 + "value": 42.991224419278794 }, { "name": "smsp__inst_executed_pipe_fp64.avg.pct_of_peak_sustained_active", @@ -8928,13 +9591,13 @@ "name": "smsp__inst_executed_pipe_xu.avg.pct_of_peak_sustained_active", "type": "Double", "unit": "", - "value": 27.205732193593988 + "value": 30.511939207130567 }, { "name": "smsp__inst_executed_pipe_lsu.avg.pct_of_peak_sustained_active", "type": "Double", "unit": "", - "value": 1.9926073383979972 + "value": 0.36873559344554757 }, { "name": "smsp__inst_executed_pipe_tex.avg.pct_of_peak_sustained_active", @@ -8946,7 +9609,7 @@ "name": "smsp__issue_active.avg.pct_of_peak_sustained_active", "type": "Double", "unit": "", - "value": 79.87738467716619 + "value": 96.94255475410675 }, { "name": "smsp__thread_inst_executed_per_inst_executed.ratio", @@ -8965,30 +9628,47 @@ "time" ], "times": { - "compilation": 14062.015, - "data": 64368.068, - "framework": 2645266.66, - "kernel_overhead": 1243019.204, - "profiling_overhead": 53358.543, - "profiling_runs": 1284520.845, + "compilation_time": 13382.007, + "data": 59647.102, + "framework": 3040552.2479999997, + "kernel_overhead": 1446340.699, + "profiling_overhead": 49477.868, + "profiling_runs": 1485086.579, "runtimes": [ - 6071.776 + 5476.512 ], - "search_algorithm": 22.344, - "validation": 12.409 + "search_algorithm": 24.619, + "validation": 18.279 }, - "timestamp": "2026-01-27 09:25:11 UTC" + "timestamp": "2026-03-13 09:39:55 UTC" }, { + "compilation_data": { + "constant_memory_size": 0, + "global_size": { + "x": 16, + "y": 32, + "z": 16 + }, + "local_memory_size": 0, + "local_size": { + "x": 16, + "y": 8, + "z": 1 + }, + "max_work_group_size": 1024, + "private_memory_size": 0, + "registers": 38 + }, "configuration": { - "INNER_UNROLL_FACTOR": "1", - "USE_CONSTANT_MEMORY": "0", - "USE_SOA": "0", - "VECTOR_SIZE": "1", - "WORK_GROUP_SIZE_X": "32", - "WORK_GROUP_SIZE_Y": "8", - "WORK_GROUP_SIZE_Z": "1", - "Z_ITERATIONS": "4" + "INNER_UNROLL_FACTOR": 8, + "USE_CONSTANT_MEMORY": 0, + "USE_SOA": 0, + "VECTOR_SIZE": 1, + "WORK_GROUP_SIZE_X": 16, + "WORK_GROUP_SIZE_Y": 8, + "WORK_GROUP_SIZE_Z": 1, + "Z_ITERATIONS": 16 }, "correctness": 1, "invalidity": "correct", @@ -8996,61 +9676,61 @@ { "name": "time", "unit": "", - "value": 6163.232 + "value": 3259.232 }, { "name": "dram__throughput.avg.pct_of_peak_sustained_elapsed", "type": "Double", "unit": "", - "value": 16.917343412881817 + "value": 15.034516992981159 }, { "name": "dram__sectors_read.sum", "type": "Double", "unit": "", - "value": 2113072.0 + "value": 4204.0 }, { "name": "dram__sectors_write.sum", "type": "Double", "unit": "", - "value": 1841288.0 + "value": 1871180.0 }, { "name": "lts__t_sectors.avg.pct_of_peak_sustained_elapsed", "type": "Double", "unit": "", - "value": 2.040531597464955 + "value": 1.8994181248726427 }, { "name": "lts__t_sectors_op_read.sum", "type": "Double", "unit": "", - "value": 2211166.0 + "value": 57103.0 }, { "name": "lts__t_sectors_op_write.sum", "type": "Double", "unit": "", - "value": 2105540.0 + "value": 2102177.0 }, { "name": "l1tex__t_requests_pipe_lsu_mem_global_op_ld.avg.pct_of_peak_sustained_active", "type": "Double", "unit": "", - "value": 3.621036173480606 + "value": 1.587468515881214 }, { "name": "l1tex__t_requests_pipe_lsu_mem_global_op_ld.sum", "type": "Double", "unit": "", - "value": 8912896.0 + "value": 2097152.0 }, { "name": "l1tex__data_pipe_lsu_wavefronts_mem_shared.avg.pct_of_peak_sustained_elapsed", "type": "Double", "unit": "", - "value": 0.15972317249761425 + "value": 0.07439025778209998 }, { "name": "l1tex__data_pipe_lsu_wavefronts_mem_shared_op_ld.sum", @@ -9080,13 +9760,13 @@ "name": "sm__warps_active.avg.pct_of_peak_sustained_active", "type": "Double", "unit": "", - "value": 98.47825269324801 + "value": 98.68602367249909 }, { "name": "smsp__cycles_active.avg.pct_of_peak_sustained_elapsed", "type": "Double", "unit": "", - "value": 99.97502822500704 + "value": 99.96579663018427 }, { "name": "smsp__sass_thread_inst_executed_op_fp16_pred_on.sum", @@ -9098,7 +9778,7 @@ "name": "smsp__sass_thread_inst_executed_op_fp32_pred_on.sum", "type": "Double", "unit": "", - "value": 5653921792.0 + "value": 4630511616.0 }, { "name": "smsp__sass_thread_inst_executed_op_fp64_pred_on.sum", @@ -9110,43 +9790,43 @@ "name": "smsp__sass_thread_inst_executed_op_integer_pred_on.sum", "type": "Double", "unit": "", - "value": 9781116928.0 + "value": 2295332864.0 }, { "name": "smsp__sass_thread_inst_executed_op_control_pred_on.sum", "type": "Double", "unit": "", - "value": 1073741824.0 + "value": 134217728.0 }, { "name": "smsp__sass_thread_inst_executed_op_memory_pred_on.sum", "type": "Double", "unit": "", - "value": 301989888.0 + "value": 83886080.0 }, { "name": "smsp__sass_thread_inst_executed_op_misc_pred_on.sum", "type": "Double", "unit": "", - "value": 1665138688.0 + "value": 1429209088.0 }, { "name": "smsp__sass_thread_inst_executed_op_conversion_pred_on.sum", "type": "Double", "unit": "", - "value": 12582912.0 + "value": 3145728.0 }, { "name": "smsp__inst_executed.sum", "type": "Double", "unit": "", - "value": 788135936.0 + "value": 471597056.0 }, { "name": "smsp__pipe_fma_cycles_active.avg.pct_of_peak_sustained_active", "type": "Double", "unit": "", - "value": 28.45092042393331 + "value": 42.026194324026925 }, { "name": "smsp__inst_executed_pipe_fp64.avg.pct_of_peak_sustained_active", @@ -9158,13 +9838,13 @@ "name": "smsp__inst_executed_pipe_xu.avg.pct_of_peak_sustained_active", "type": "Double", "unit": "", - "value": 27.26623030127273 + "value": 50.801125009237715 }, { "name": "smsp__inst_executed_pipe_lsu.avg.pct_of_peak_sustained_active", "type": "Double", "unit": "", - "value": 1.9970383521439987 + "value": 1.0294173280680492 }, { "name": "smsp__inst_executed_pipe_tex.avg.pct_of_peak_sustained_active", @@ -9176,7 +9856,7 @@ "name": "smsp__issue_active.avg.pct_of_peak_sustained_active", "type": "Double", "unit": "", - "value": 80.0550221201698 + "value": 89.25000702586684 }, { "name": "smsp__thread_inst_executed_per_inst_executed.ratio", @@ -9195,30 +9875,47 @@ "time" ], "times": { - "compilation": 13119.551, - "data": 65073.81, - "framework": 2646378.6610000003, - "kernel_overhead": 1242182.133, - "profiling_overhead": 55140.34, - "profiling_runs": 1283982.378, + "compilation_time": 13403.242, + "data": 59489.271, + "framework": 1645560.0869999998, + "kernel_overhead": 754426.085, + "profiling_overhead": 49392.814, + "profiling_runs": 782251.917, "runtimes": [ - 6163.232 + 3259.232 ], - "search_algorithm": 23.264, - "validation": 14.273 + "search_algorithm": 24.036, + "validation": 16.376 }, - "timestamp": "2026-01-27 09:25:12 UTC" + "timestamp": "2026-03-13 09:39:56 UTC" }, { + "compilation_data": { + "constant_memory_size": 0, + "global_size": { + "x": 16, + "y": 32, + "z": 8 + }, + "local_memory_size": 0, + "local_size": { + "x": 16, + "y": 8, + "z": 1 + }, + "max_work_group_size": 1024, + "private_memory_size": 0, + "registers": 48 + }, "configuration": { - "INNER_UNROLL_FACTOR": "1", - "USE_CONSTANT_MEMORY": "0", - "USE_SOA": "0", - "VECTOR_SIZE": "1", - "WORK_GROUP_SIZE_X": "16", - "WORK_GROUP_SIZE_Y": "4", - "WORK_GROUP_SIZE_Z": "1", - "Z_ITERATIONS": "8" + "INNER_UNROLL_FACTOR": 8, + "USE_CONSTANT_MEMORY": 0, + "USE_SOA": 0, + "VECTOR_SIZE": 1, + "WORK_GROUP_SIZE_X": 16, + "WORK_GROUP_SIZE_Y": 8, + "WORK_GROUP_SIZE_Z": 1, + "Z_ITERATIONS": 32 }, "correctness": 1, "invalidity": "correct", @@ -9226,61 +9923,61 @@ { "name": "time", "unit": "", - "value": 8372.927 + "value": 6226.656 }, { "name": "dram__throughput.avg.pct_of_peak_sustained_elapsed", "type": "Double", "unit": "", - "value": 12.50791356252932 + "value": 7.887015447469977 }, { "name": "dram__sectors_read.sum", "type": "Double", "unit": "", - "value": 2098096.0 + "value": 5816.0 }, { "name": "dram__sectors_write.sum", "type": "Double", "unit": "", - "value": 1929188.0 + "value": 1867200.0 }, { "name": "lts__t_sectors.avg.pct_of_peak_sustained_elapsed", "type": "Double", "unit": "", - "value": 48.22737008075276 + "value": 1.0270978028254016 }, { "name": "lts__t_sectors_op_read.sum", "type": "Double", "unit": "", - "value": 2986769.0 + "value": 105277.0 }, { "name": "lts__t_sectors_op_write.sum", "type": "Double", "unit": "", - "value": 138418787.0 + "value": 2101160.0 }, { "name": "l1tex__t_requests_pipe_lsu_mem_global_op_ld.avg.pct_of_peak_sustained_active", "type": "Double", "unit": "", - "value": 1.3869886120799175 + "value": 0.41932906661507807 }, { "name": "l1tex__t_requests_pipe_lsu_mem_global_op_ld.sum", "type": "Double", "unit": "", - "value": 4718592.0 + "value": 1048576.0 }, { "name": "l1tex__data_pipe_lsu_wavefronts_mem_shared.avg.pct_of_peak_sustained_elapsed", "type": "Double", "unit": "", - "value": 0.0576859511658942 + "value": 0.019651875847355206 }, { "name": "l1tex__data_pipe_lsu_wavefronts_mem_shared_op_ld.sum", @@ -9298,25 +9995,25 @@ "name": "l1tex__t_sectors_pipe_lsu_mem_local_op_ld.sum", "type": "Double", "unit": "", - "value": 136314880.0 + "value": 0.0 }, { "name": "l1tex__t_sectors_pipe_lsu_mem_local_op_st.sum", "type": "Double", "unit": "", - "value": 136314880.0 + "value": 0.0 }, { "name": "sm__warps_active.avg.pct_of_peak_sustained_active", "type": "Double", "unit": "", - "value": 90.80347159782782 + "value": 81.97553947736539 }, { "name": "smsp__cycles_active.avg.pct_of_peak_sustained_elapsed", "type": "Double", "unit": "", - "value": 99.8767474347051 + "value": 99.97570943063904 }, { "name": "smsp__sass_thread_inst_executed_op_fp16_pred_on.sum", @@ -9328,7 +10025,7 @@ "name": "smsp__sass_thread_inst_executed_op_fp32_pred_on.sum", "type": "Double", "unit": "", - "value": 4982833152.0 + "value": 4462739456.0 }, { "name": "smsp__sass_thread_inst_executed_op_fp64_pred_on.sum", @@ -9340,43 +10037,43 @@ "name": "smsp__sass_thread_inst_executed_op_integer_pred_on.sum", "type": "Double", "unit": "", - "value": 3709861888.0 + "value": 6215958528.0 }, { "name": "smsp__sass_thread_inst_executed_op_control_pred_on.sum", "type": "Double", "unit": "", - "value": 1073741824.0 + "value": 134217728.0 }, { "name": "smsp__sass_thread_inst_executed_op_memory_pred_on.sum", "type": "Double", "unit": "", - "value": 2336227328.0 + "value": 50331648.0 }, { "name": "smsp__sass_thread_inst_executed_op_misc_pred_on.sum", "type": "Double", "unit": "", - "value": 427819008.0 + "value": 1355284480.0 }, { "name": "smsp__sass_thread_inst_executed_op_conversion_pred_on.sum", "type": "Double", "unit": "", - "value": 6291456.0 + "value": 1572864.0 }, { "name": "smsp__inst_executed.sum", "type": "Double", "unit": "", - "value": 396296192.0 + "value": 936706048.0 }, { "name": "smsp__pipe_fma_cycles_active.avg.pct_of_peak_sustained_active", "type": "Double", "unit": "", - "value": 9.716743705108334 + "value": 42.84512996444503 }, { "name": "smsp__inst_executed_pipe_fp64.avg.pct_of_peak_sustained_active", @@ -9388,13 +10085,13 @@ "name": "smsp__inst_executed_pipe_xu.avg.pct_of_peak_sustained_active", "type": "Double", "unit": "", - "value": 19.71443654673584 + "value": 26.83788023083479 }, { "name": "smsp__inst_executed_pipe_lsu.avg.pct_of_peak_sustained_active", "type": "Double", "unit": "", - "value": 10.752453917335906 + "value": 0.3243347342349419 }, { "name": "smsp__inst_executed_pipe_tex.avg.pct_of_peak_sustained_active", @@ -9406,7 +10103,7 @@ "name": "smsp__issue_active.avg.pct_of_peak_sustained_active", "type": "Double", "unit": "", - "value": 29.10506707660855 + "value": 93.6513139822236 }, { "name": "smsp__thread_inst_executed_per_inst_executed.ratio", @@ -9425,30 +10122,47 @@ "time" ], "times": { - "compilation": 13807.447, - "data": 60981.938, - "framework": 897746.2690000001, - "kernel_overhead": 366328.469, - "profiling_overhead": 50922.915, - "profiling_runs": 419512.947, + "compilation_time": 13911.353, + "data": 59725.602, + "framework": 3522754.5779999997, + "kernel_overhead": 1686152.16, + "profiling_overhead": 48671.242, + "profiling_runs": 1728205.574, "runtimes": [ - 8372.927 + 6226.656 ], - "search_algorithm": 36.353, - "validation": 14.051 + "search_algorithm": 24.132, + "validation": 13.743 }, - "timestamp": "2026-01-27 09:25:12 UTC" + "timestamp": "2026-03-13 09:39:58 UTC" }, { + "compilation_data": { + "constant_memory_size": 0, + "global_size": { + "x": 16, + "y": 32, + "z": 8 + }, + "local_memory_size": 0, + "local_size": { + "x": 16, + "y": 8, + "z": 1 + }, + "max_work_group_size": 1024, + "private_memory_size": 0, + "registers": 48 + }, "configuration": { - "INNER_UNROLL_FACTOR": "1", - "USE_CONSTANT_MEMORY": "0", - "USE_SOA": "0", - "VECTOR_SIZE": "1", - "WORK_GROUP_SIZE_X": "16", - "WORK_GROUP_SIZE_Y": "8", - "WORK_GROUP_SIZE_Z": "1", - "Z_ITERATIONS": "8" + "INNER_UNROLL_FACTOR": 16, + "USE_CONSTANT_MEMORY": 0, + "USE_SOA": 0, + "VECTOR_SIZE": 1, + "WORK_GROUP_SIZE_X": 16, + "WORK_GROUP_SIZE_Y": 8, + "WORK_GROUP_SIZE_Z": 1, + "Z_ITERATIONS": 32 }, "correctness": 1, "invalidity": "correct", @@ -9456,61 +10170,61 @@ { "name": "time", "unit": "", - "value": 8655.36 + "value": 5852.544 }, { "name": "dram__throughput.avg.pct_of_peak_sustained_elapsed", "type": "Double", "unit": "", - "value": 11.979340888875374 + "value": 8.389816318969448 }, { "name": "dram__sectors_read.sum", "type": "Double", "unit": "", - "value": 2101872.0 + "value": 11308.0 }, { "name": "dram__sectors_write.sum", "type": "Double", "unit": "", - "value": 1931988.0 + "value": 1872736.0 }, { "name": "lts__t_sectors.avg.pct_of_peak_sustained_elapsed", "type": "Double", "unit": "", - "value": 49.04614392377234 + "value": 1.080135320686978 }, { "name": "lts__t_sectors_op_read.sum", "type": "Double", "unit": "", - "value": 3376217.0 + "value": 107593.0 }, { "name": "lts__t_sectors_op_write.sum", "type": "Double", "unit": "", - "value": 138424270.0 + "value": 2103178.0 }, { "name": "l1tex__t_requests_pipe_lsu_mem_global_op_ld.avg.pct_of_peak_sustained_active", "type": "Double", "unit": "", - "value": 1.4151636575543471 + "value": 0.4408730712638775 }, { "name": "l1tex__t_requests_pipe_lsu_mem_global_op_ld.sum", "type": "Double", "unit": "", - "value": 4718592.0 + "value": 1048576.0 }, { "name": "l1tex__data_pipe_lsu_wavefronts_mem_shared.avg.pct_of_peak_sustained_elapsed", "type": "Double", "unit": "", - "value": 0.059314743831312494 + "value": 0.020661025014165783 }, { "name": "l1tex__data_pipe_lsu_wavefronts_mem_shared_op_ld.sum", @@ -9528,25 +10242,25 @@ "name": "l1tex__t_sectors_pipe_lsu_mem_local_op_ld.sum", "type": "Double", "unit": "", - "value": 136314880.0 + "value": 0.0 }, { "name": "l1tex__t_sectors_pipe_lsu_mem_local_op_st.sum", "type": "Double", "unit": "", - "value": 136314880.0 + "value": 0.0 }, { "name": "sm__warps_active.avg.pct_of_peak_sustained_active", "type": "Double", "unit": "", - "value": 97.47321787140652 + "value": 81.98062186061263 }, { "name": "smsp__cycles_active.avg.pct_of_peak_sustained_elapsed", "type": "Double", "unit": "", - "value": 100.9189829378639 + "value": 99.96918532458558 }, { "name": "smsp__sass_thread_inst_executed_op_fp16_pred_on.sum", @@ -9558,7 +10272,7 @@ "name": "smsp__sass_thread_inst_executed_op_fp32_pred_on.sum", "type": "Double", "unit": "", - "value": 4982833152.0 + "value": 4462739456.0 }, { "name": "smsp__sass_thread_inst_executed_op_fp64_pred_on.sum", @@ -9570,43 +10284,43 @@ "name": "smsp__sass_thread_inst_executed_op_integer_pred_on.sum", "type": "Double", "unit": "", - "value": 3709861888.0 + "value": 5813305344.0 }, { "name": "smsp__sass_thread_inst_executed_op_control_pred_on.sum", "type": "Double", "unit": "", - "value": 1073741824.0 + "value": 101187584.0 }, { "name": "smsp__sass_thread_inst_executed_op_memory_pred_on.sum", "type": "Double", "unit": "", - "value": 2336227328.0 + "value": 50331648.0 }, { "name": "smsp__sass_thread_inst_executed_op_misc_pred_on.sum", "type": "Double", "unit": "", - "value": 427819008.0 + "value": 986185728.0 }, { "name": "smsp__sass_thread_inst_executed_op_conversion_pred_on.sum", "type": "Double", "unit": "", - "value": 6291456.0 + "value": 1572864.0 }, { "name": "smsp__inst_executed.sum", "type": "Double", "unit": "", - "value": 396296192.0 + "value": 897892352.0 }, { "name": "smsp__pipe_fma_cycles_active.avg.pct_of_peak_sustained_active", "type": "Double", "unit": "", - "value": 9.8894086491397 + "value": 43.94575174575264 }, { "name": "smsp__inst_executed_pipe_fp64.avg.pct_of_peak_sustained_active", @@ -9618,13 +10332,13 @@ "name": "smsp__inst_executed_pipe_xu.avg.pct_of_peak_sustained_active", "type": "Double", "unit": "", - "value": 20.061735303278123 + "value": 28.217881401236966 }, { "name": "smsp__inst_executed_pipe_lsu.avg.pct_of_peak_sustained_active", "type": "Double", "unit": "", - "value": 10.941874186407063 + "value": 0.3410119944729565 }, { "name": "smsp__inst_executed_pipe_tex.avg.pct_of_peak_sustained_active", @@ -9636,7 +10350,7 @@ "name": "smsp__issue_active.avg.pct_of_peak_sustained_active", "type": "Double", "unit": "", - "value": 29.61781621006878 + "value": 94.49871148406723 }, { "name": "smsp__thread_inst_executed_per_inst_executed.ratio", @@ -9655,30 +10369,47 @@ "time" ], "times": { - "compilation": 13102.598, - "data": 65294.618, - "framework": 912863.9619999999, - "kernel_overhead": 369096.882, - "profiling_overhead": 55058.911, - "profiling_runs": 423413.551, + "compilation_time": 13853.837, + "data": 59305.4, + "framework": 3429615.082, + "kernel_overhead": 1640254.03, + "profiling_overhead": 49187.747, + "profiling_runs": 1680867.905, "runtimes": [ - 8655.36 + 5852.544 ], - "search_algorithm": 24.239, - "validation": 16.289 + "search_algorithm": 25.979, + "validation": 16.924 }, - "timestamp": "2026-01-27 09:25:13 UTC" + "timestamp": "2026-03-13 09:39:59 UTC" }, { + "compilation_data": { + "constant_memory_size": 0, + "global_size": { + "x": 8, + "y": 128, + "z": 256 + }, + "local_memory_size": 0, + "local_size": { + "x": 32, + "y": 2, + "z": 1 + }, + "max_work_group_size": 1024, + "private_memory_size": 0, + "registers": 44 + }, "configuration": { - "INNER_UNROLL_FACTOR": "1", - "USE_CONSTANT_MEMORY": "0", - "USE_SOA": "0", - "VECTOR_SIZE": "1", - "WORK_GROUP_SIZE_X": "32", - "WORK_GROUP_SIZE_Y": "2", - "WORK_GROUP_SIZE_Z": "1", - "Z_ITERATIONS": "8" + "INNER_UNROLL_FACTOR": 0, + "USE_CONSTANT_MEMORY": 0, + "USE_SOA": 0, + "VECTOR_SIZE": 1, + "WORK_GROUP_SIZE_X": 32, + "WORK_GROUP_SIZE_Y": 2, + "WORK_GROUP_SIZE_Z": 1, + "Z_ITERATIONS": 1 }, "correctness": 1, "invalidity": "correct", @@ -9686,61 +10417,61 @@ { "name": "time", "unit": "", - "value": 8136.192 + "value": 3686.048 }, { "name": "dram__throughput.avg.pct_of_peak_sustained_elapsed", "type": "Double", "unit": "", - "value": 12.833998928504242 + "value": 13.076858662364652 }, { "name": "dram__sectors_read.sum", "type": "Double", "unit": "", - "value": 2109064.0 + "value": 416.0 }, { "name": "dram__sectors_write.sum", "type": "Double", "unit": "", - "value": 1907760.0 + "value": 1836052.0 }, { "name": "lts__t_sectors.avg.pct_of_peak_sustained_elapsed", "type": "Double", "unit": "", - "value": 49.49819756868054 + "value": 1.6878185285454446 }, { "name": "lts__t_sectors_op_read.sum", "type": "Double", "unit": "", - "value": 2803090.0 + "value": 61090.0 }, { "name": "lts__t_sectors_op_write.sum", "type": "Double", "unit": "", - "value": 138416730.0 + "value": 2099321.0 }, { "name": "l1tex__t_requests_pipe_lsu_mem_global_op_ld.avg.pct_of_peak_sustained_active", "type": "Double", "unit": "", - "value": 1.4172562677691027 + "value": 22.559762380196002 }, { "name": "l1tex__t_requests_pipe_lsu_mem_global_op_ld.sum", "type": "Double", "unit": "", - "value": 4718592.0 + "value": 33554432.0 }, { "name": "l1tex__data_pipe_lsu_wavefronts_mem_shared.avg.pct_of_peak_sustained_elapsed", "type": "Double", "unit": "", - "value": 0.05929601329373381 + "value": 1.057200671268224 }, { "name": "l1tex__data_pipe_lsu_wavefronts_mem_shared_op_ld.sum", @@ -9758,25 +10489,25 @@ "name": "l1tex__t_sectors_pipe_lsu_mem_local_op_ld.sum", "type": "Double", "unit": "", - "value": 136314880.0 + "value": 0.0 }, { "name": "l1tex__t_sectors_pipe_lsu_mem_local_op_st.sum", "type": "Double", "unit": "", - "value": 136314880.0 + "value": 0.0 }, { "name": "sm__warps_active.avg.pct_of_peak_sustained_active", "type": "Double", "unit": "", - "value": 90.82169696189482 + "value": 60.366339899006526 }, { "name": "smsp__cycles_active.avg.pct_of_peak_sustained_elapsed", "type": "Double", "unit": "", - "value": 100.62058571811028 + "value": 99.95431328657357 }, { "name": "smsp__sass_thread_inst_executed_op_fp16_pred_on.sum", @@ -9788,7 +10519,7 @@ "name": "smsp__sass_thread_inst_executed_op_fp32_pred_on.sum", "type": "Double", "unit": "", - "value": 4982833152.0 + "value": 8589934592.0 }, { "name": "smsp__sass_thread_inst_executed_op_fp64_pred_on.sum", @@ -9800,43 +10531,43 @@ "name": "smsp__sass_thread_inst_executed_op_integer_pred_on.sum", "type": "Double", "unit": "", - "value": 3709861888.0 + "value": 587202560.0 }, { "name": "smsp__sass_thread_inst_executed_op_control_pred_on.sum", "type": "Double", "unit": "", - "value": 1073741824.0 + "value": 117440512.0 }, { "name": "smsp__sass_thread_inst_executed_op_memory_pred_on.sum", "type": "Double", "unit": "", - "value": 2336227328.0 + "value": 1090519040.0 }, { "name": "smsp__sass_thread_inst_executed_op_misc_pred_on.sum", "type": "Double", "unit": "", - "value": 427819008.0 + "value": 234881024.0 }, { "name": "smsp__sass_thread_inst_executed_op_conversion_pred_on.sum", "type": "Double", "unit": "", - "value": 6291456.0 + "value": 50331648.0 }, { "name": "smsp__inst_executed.sum", "type": "Double", "unit": "", - "value": 396296192.0 + "value": 341311488.0 }, { "name": "smsp__pipe_fma_cycles_active.avg.pct_of_peak_sustained_active", "type": "Double", "unit": "", - "value": 9.914244008578772 + "value": 40.1037043487646 }, { "name": "smsp__inst_executed_pipe_fp64.avg.pct_of_peak_sustained_active", @@ -9848,13 +10579,13 @@ "name": "smsp__inst_executed_pipe_xu.avg.pct_of_peak_sustained_active", "type": "Double", "unit": "", - "value": 20.114875824346296 + "value": 45.12784607048731 }, { "name": "smsp__inst_executed_pipe_lsu.avg.pct_of_peak_sustained_active", "type": "Double", "unit": "", - "value": 10.97085756630606 + "value": 11.987084112473193 }, { "name": "smsp__inst_executed_pipe_tex.avg.pct_of_peak_sustained_active", @@ -9866,7 +10597,7 @@ "name": "smsp__issue_active.avg.pct_of_peak_sustained_active", "type": "Double", "unit": "", - "value": 29.696248821107563 + "value": 57.38004328271467 }, { "name": "smsp__thread_inst_executed_per_inst_executed.ratio", @@ -9885,30 +10616,47 @@ "time" ], "times": { - "compilation": 13037.085, - "data": 63156.816, - "framework": 906775.8049999999, - "kernel_overhead": 369340.913, - "profiling_overhead": 52916.404, - "profiling_runs": 421361.672, + "compilation_time": 14887.282, + "data": 57246.511, + "framework": 244937.234, + "kernel_overhead": 55117.348, + "profiling_overhead": 47394.438, + "profiling_runs": 85178.937, "runtimes": [ - 8136.192 + 3686.048 ], - "search_algorithm": 21.467, - "validation": 12.266 + "search_algorithm": 24.012, + "validation": 17.475 }, - "timestamp": "2026-01-27 09:25:13 UTC" + "timestamp": "2026-03-13 09:40:0 UTC" }, { + "compilation_data": { + "constant_memory_size": 0, + "global_size": { + "x": 8, + "y": 128, + "z": 128 + }, + "local_memory_size": 0, + "local_size": { + "x": 32, + "y": 2, + "z": 1 + }, + "max_work_group_size": 1024, + "private_memory_size": 0, + "registers": 31 + }, "configuration": { - "INNER_UNROLL_FACTOR": "1", - "USE_CONSTANT_MEMORY": "0", - "USE_SOA": "0", - "VECTOR_SIZE": "1", - "WORK_GROUP_SIZE_X": "32", - "WORK_GROUP_SIZE_Y": "4", - "WORK_GROUP_SIZE_Z": "1", - "Z_ITERATIONS": "8" + "INNER_UNROLL_FACTOR": 0, + "USE_CONSTANT_MEMORY": 0, + "USE_SOA": 0, + "VECTOR_SIZE": 1, + "WORK_GROUP_SIZE_X": 32, + "WORK_GROUP_SIZE_Y": 2, + "WORK_GROUP_SIZE_Z": 1, + "Z_ITERATIONS": 2 }, "correctness": 1, "invalidity": "correct", @@ -9916,61 +10664,61 @@ { "name": "time", "unit": "", - "value": 8091.679 + "value": 2122.016 }, { "name": "dram__throughput.avg.pct_of_peak_sustained_elapsed", "type": "Double", "unit": "", - "value": 12.907896705973037 + "value": 22.956888350601176 }, { "name": "dram__sectors_read.sum", "type": "Double", "unit": "", - "value": 2117484.0 + "value": 504.0 }, { "name": "dram__sectors_write.sum", "type": "Double", "unit": "", - "value": 1912192.0 + "value": 1837340.0 }, { "name": "lts__t_sectors.avg.pct_of_peak_sustained_elapsed", "type": "Double", "unit": "", - "value": 49.70830635768764 + "value": 2.926737917787674 }, { "name": "lts__t_sectors_op_read.sum", "type": "Double", "unit": "", - "value": 3038058.0 + "value": 36061.0 }, { "name": "lts__t_sectors_op_write.sum", "type": "Double", "unit": "", - "value": 138420407.0 + "value": 2099302.0 }, { "name": "l1tex__t_requests_pipe_lsu_mem_global_op_ld.avg.pct_of_peak_sustained_active", "type": "Double", "unit": "", - "value": 1.4227037380725192 + "value": 19.908599853805335 }, { "name": "l1tex__t_requests_pipe_lsu_mem_global_op_ld.sum", "type": "Double", "unit": "", - "value": 4718592.0 + "value": 16777216.0 }, { "name": "l1tex__data_pipe_lsu_wavefronts_mem_shared.avg.pct_of_peak_sustained_elapsed", "type": "Double", "unit": "", - "value": 0.05944712864152063 + "value": 0.9327028657671272 }, { "name": "l1tex__data_pipe_lsu_wavefronts_mem_shared_op_ld.sum", @@ -9988,25 +10736,25 @@ "name": "l1tex__t_sectors_pipe_lsu_mem_local_op_ld.sum", "type": "Double", "unit": "", - "value": 136314880.0 + "value": 0.0 }, { "name": "l1tex__t_sectors_pipe_lsu_mem_local_op_st.sum", "type": "Double", "unit": "", - "value": 136314880.0 + "value": 0.0 }, { "name": "sm__warps_active.avg.pct_of_peak_sustained_active", "type": "Double", "unit": "", - "value": 97.66539987271042 + "value": 95.60071689181278 }, { "name": "smsp__cycles_active.avg.pct_of_peak_sustained_elapsed", "type": "Double", "unit": "", - "value": 99.52863262992805 + "value": 99.92998341934313 }, { "name": "smsp__sass_thread_inst_executed_op_fp16_pred_on.sum", @@ -10018,7 +10766,7 @@ "name": "smsp__sass_thread_inst_executed_op_fp32_pred_on.sum", "type": "Double", "unit": "", - "value": 4982833152.0 + "value": 6442450944.0 }, { "name": "smsp__sass_thread_inst_executed_op_fp64_pred_on.sum", @@ -10030,43 +10778,43 @@ "name": "smsp__sass_thread_inst_executed_op_integer_pred_on.sum", "type": "Double", "unit": "", - "value": 3709861888.0 + "value": 704643072.0 }, { "name": "smsp__sass_thread_inst_executed_op_control_pred_on.sum", "type": "Double", "unit": "", - "value": 1073741824.0 + "value": 142606336.0 }, { "name": "smsp__sass_thread_inst_executed_op_memory_pred_on.sum", "type": "Double", "unit": "", - "value": 2336227328.0 + "value": 553648128.0 }, { "name": "smsp__sass_thread_inst_executed_op_misc_pred_on.sum", "type": "Double", "unit": "", - "value": 427819008.0 + "value": 134217728.0 }, { "name": "smsp__sass_thread_inst_executed_op_conversion_pred_on.sum", "type": "Double", "unit": "", - "value": 6291456.0 + "value": 25165824.0 }, { "name": "smsp__inst_executed.sum", "type": "Double", "unit": "", - "value": 396296192.0 + "value": 256901120.0 }, { "name": "smsp__pipe_fma_cycles_active.avg.pct_of_peak_sustained_active", "type": "Double", "unit": "", - "value": 10.049757274126256 + "value": 50.48047723925701 }, { "name": "smsp__inst_executed_pipe_fp64.avg.pct_of_peak_sustained_active", @@ -10078,13 +10826,13 @@ "name": "smsp__inst_executed_pipe_xu.avg.pct_of_peak_sustained_active", "type": "Double", "unit": "", - "value": 20.387386061811686 + "value": 79.64641023852649 }, { "name": "smsp__inst_executed_pipe_lsu.avg.pct_of_peak_sustained_active", "type": "Double", "unit": "", - "value": 11.119487417501784 + "value": 10.733598254801421 }, { "name": "smsp__inst_executed_pipe_tex.avg.pct_of_peak_sustained_active", @@ -10096,7 +10844,7 @@ "name": "smsp__issue_active.avg.pct_of_peak_sustained_active", "type": "Double", "unit": "", - "value": 30.098585708009967 + "value": 76.22509632686369 }, { "name": "smsp__thread_inst_executed_per_inst_executed.ratio", @@ -10115,30 +10863,47 @@ "time" ], "times": { - "compilation": 13425.064, - "data": 61751.747, - "framework": 903025.9639999999, - "kernel_overhead": 368611.46, - "profiling_overhead": 51648.84, - "profiling_runs": 421013.917, + "compilation_time": 16149.166, + "data": 57099.446, + "framework": 249865.95799999998, + "kernel_overhead": 61613.246, + "profiling_overhead": 47376.441, + "profiling_runs": 83776.825, "runtimes": [ - 8091.679 + 2122.016 ], - "search_algorithm": 24.834, - "validation": 13.499 + "search_algorithm": 24.054, + "validation": 13.777 }, - "timestamp": "2026-01-27 09:25:14 UTC" + "timestamp": "2026-03-13 09:40:0 UTC" }, { + "compilation_data": { + "constant_memory_size": 0, + "global_size": { + "x": 8, + "y": 128, + "z": 64 + }, + "local_memory_size": 0, + "local_size": { + "x": 32, + "y": 2, + "z": 1 + }, + "max_work_group_size": 1024, + "private_memory_size": 0, + "registers": 39 + }, "configuration": { - "INNER_UNROLL_FACTOR": "1", - "USE_CONSTANT_MEMORY": "0", - "USE_SOA": "0", - "VECTOR_SIZE": "1", - "WORK_GROUP_SIZE_X": "32", - "WORK_GROUP_SIZE_Y": "8", - "WORK_GROUP_SIZE_Z": "1", - "Z_ITERATIONS": "8" + "INNER_UNROLL_FACTOR": 0, + "USE_CONSTANT_MEMORY": 0, + "USE_SOA": 0, + "VECTOR_SIZE": 1, + "WORK_GROUP_SIZE_X": 32, + "WORK_GROUP_SIZE_Y": 2, + "WORK_GROUP_SIZE_Z": 1, + "Z_ITERATIONS": 4 }, "correctness": 1, "invalidity": "correct", @@ -10146,61 +10911,61 @@ { "name": "time", "unit": "", - "value": 8199.263 + "value": 1943.936 }, { "name": "dram__throughput.avg.pct_of_peak_sustained_elapsed", "type": "Double", "unit": "", - "value": 12.697101708398959 + "value": 24.6764072224882 }, { "name": "dram__sectors_read.sum", "type": "Double", "unit": "", - "value": 2132144.0 + "value": 7548.0 }, { "name": "dram__sectors_write.sum", "type": "Double", "unit": "", - "value": 1912856.0 + "value": 1839716.0 }, { "name": "lts__t_sectors.avg.pct_of_peak_sustained_elapsed", "type": "Double", "unit": "", - "value": 49.67212986098539 + "value": 3.316109558208976 }, { "name": "lts__t_sectors_op_read.sum", "type": "Double", "unit": "", - "value": 2849497.0 + "value": 40549.0 }, { "name": "lts__t_sectors_op_write.sum", "type": "Double", "unit": "", - "value": 138419779.0 + "value": 2101052.0 }, { "name": "l1tex__t_requests_pipe_lsu_mem_global_op_ld.avg.pct_of_peak_sustained_active", "type": "Double", "unit": "", - "value": 1.4247375382530494 + "value": 11.291482658317713 }, { "name": "l1tex__t_requests_pipe_lsu_mem_global_op_ld.sum", "type": "Double", "unit": "", - "value": 4718592.0 + "value": 8388608.0 }, { "name": "l1tex__data_pipe_lsu_wavefronts_mem_shared.avg.pct_of_peak_sustained_elapsed", "type": "Double", "unit": "", - "value": 0.059642775305308836 + "value": 0.5289068945454217 }, { "name": "l1tex__data_pipe_lsu_wavefronts_mem_shared_op_ld.sum", @@ -10218,25 +10983,25 @@ "name": "l1tex__t_sectors_pipe_lsu_mem_local_op_ld.sum", "type": "Double", "unit": "", - "value": 136314880.0 + "value": 0.0 }, { "name": "l1tex__t_sectors_pipe_lsu_mem_local_op_st.sum", "type": "Double", "unit": "", - "value": 136314880.0 + "value": 0.0 }, { "name": "sm__warps_active.avg.pct_of_peak_sustained_active", "type": "Double", "unit": "", - "value": 96.32627096462475 + "value": 96.69094759596389 }, { "name": "smsp__cycles_active.avg.pct_of_peak_sustained_elapsed", "type": "Double", "unit": "", - "value": 99.93215895197626 + "value": 99.908493213104 }, { "name": "smsp__sass_thread_inst_executed_op_fp16_pred_on.sum", @@ -10248,7 +11013,7 @@ "name": "smsp__sass_thread_inst_executed_op_fp32_pred_on.sum", "type": "Double", "unit": "", - "value": 4982833152.0 + "value": 5368709120.0 }, { "name": "smsp__sass_thread_inst_executed_op_fp64_pred_on.sum", @@ -10260,43 +11025,43 @@ "name": "smsp__sass_thread_inst_executed_op_integer_pred_on.sum", "type": "Double", "unit": "", - "value": 3709861888.0 + "value": 390070272.0 }, { "name": "smsp__sass_thread_inst_executed_op_control_pred_on.sum", "type": "Double", "unit": "", - "value": 1073741824.0 + "value": 71303168.0 }, { "name": "smsp__sass_thread_inst_executed_op_memory_pred_on.sum", "type": "Double", "unit": "", - "value": 2336227328.0 + "value": 285212672.0 }, { "name": "smsp__sass_thread_inst_executed_op_misc_pred_on.sum", "type": "Double", "unit": "", - "value": 427819008.0 + "value": 88080384.0 }, { "name": "smsp__sass_thread_inst_executed_op_conversion_pred_on.sum", "type": "Double", "unit": "", - "value": 6291456.0 + "value": 12582912.0 }, { "name": "smsp__inst_executed.sum", "type": "Double", "unit": "", - "value": 396296192.0 + "value": 197394432.0 }, { "name": "smsp__pipe_fma_cycles_active.avg.pct_of_peak_sustained_active", "type": "Double", "unit": "", - "value": 10.042002665450585 + "value": 45.83640720148217 }, { "name": "smsp__inst_executed_pipe_fp64.avg.pct_of_peak_sustained_active", @@ -10308,13 +11073,13 @@ "name": "smsp__inst_executed_pipe_xu.avg.pct_of_peak_sustained_active", "type": "Double", "unit": "", - "value": 20.371887806402167 + "value": 90.34945255009201 }, { "name": "smsp__inst_executed_pipe_lsu.avg.pct_of_peak_sustained_active", "type": "Double", "unit": "", - "value": 11.111034511597277 + "value": 6.264463995172395 }, { "name": "smsp__inst_executed_pipe_tex.avg.pct_of_peak_sustained_active", @@ -10326,7 +11091,7 @@ "name": "smsp__issue_active.avg.pct_of_peak_sustained_active", "type": "Double", "unit": "", - "value": 30.07570503490274 + "value": 66.43997761016234 }, { "name": "smsp__thread_inst_executed_per_inst_executed.ratio", @@ -10345,30 +11110,47 @@ "time" ], "times": { - "compilation": 13285.056, - "data": 63038.863, - "framework": 907832.749, - "kernel_overhead": 369598.48, - "profiling_overhead": 52794.055, - "profiling_runs": 422401.351, + "compilation_time": 16585.475, + "data": 58463.755, + "framework": 200088.5, + "kernel_overhead": 36315.763, + "profiling_overhead": 47797.419, + "profiling_runs": 57511.563, "runtimes": [ - 8199.263 + 1943.936 ], - "search_algorithm": 22.2, - "validation": 18.276 + "search_algorithm": 34.075, + "validation": 17.789 }, - "timestamp": "2026-01-27 09:25:15 UTC" + "timestamp": "2026-03-13 09:40:0 UTC" }, { + "compilation_data": { + "constant_memory_size": 0, + "global_size": { + "x": 8, + "y": 128, + "z": 32 + }, + "local_memory_size": 0, + "local_size": { + "x": 32, + "y": 2, + "z": 1 + }, + "max_work_group_size": 1024, + "private_memory_size": 0, + "registers": 40 + }, "configuration": { - "INNER_UNROLL_FACTOR": "1", - "USE_CONSTANT_MEMORY": "0", - "USE_SOA": "0", - "VECTOR_SIZE": "1", - "WORK_GROUP_SIZE_X": "16", - "WORK_GROUP_SIZE_Y": "4", - "WORK_GROUP_SIZE_Z": "1", - "Z_ITERATIONS": "16" + "INNER_UNROLL_FACTOR": 0, + "USE_CONSTANT_MEMORY": 0, + "USE_SOA": 0, + "VECTOR_SIZE": 1, + "WORK_GROUP_SIZE_X": 32, + "WORK_GROUP_SIZE_Y": 2, + "WORK_GROUP_SIZE_Z": 1, + "Z_ITERATIONS": 8 }, "correctness": 1, "invalidity": "correct", @@ -10376,61 +11158,61 @@ { "name": "time", "unit": "", - "value": 8455.552 + "value": 1859.104 }, { "name": "dram__throughput.avg.pct_of_peak_sustained_elapsed", "type": "Double", "unit": "", - "value": 12.960110243988515 + "value": 26.431462741490343 }, { "name": "dram__sectors_read.sum", "type": "Double", "unit": "", - "value": 2113768.0 + "value": 1280.0 }, { "name": "dram__sectors_write.sum", "type": "Double", "unit": "", - "value": 2022784.0 + "value": 1837504.0 }, { "name": "lts__t_sectors.avg.pct_of_peak_sustained_elapsed", "type": "Double", "unit": "", - "value": 54.49860892758552 + "value": 3.411852801258955 }, { "name": "lts__t_sectors_op_read.sum", "type": "Double", "unit": "", - "value": 19447772.0 + "value": 31743.0 }, { "name": "lts__t_sectors_op_write.sum", "type": "Double", "unit": "", - "value": 138425493.0 + "value": 2099261.0 }, { "name": "l1tex__t_requests_pipe_lsu_mem_global_op_ld.avg.pct_of_peak_sustained_active", "type": "Double", "unit": "", - "value": 0.7821785225803504 + "value": 5.7972407078998724 }, { "name": "l1tex__t_requests_pipe_lsu_mem_global_op_ld.sum", "type": "Double", "unit": "", - "value": 2621440.0 + "value": 4194304.0 }, { "name": "l1tex__data_pipe_lsu_wavefronts_mem_shared.avg.pct_of_peak_sustained_elapsed", "type": "Double", "unit": "", - "value": 0.029177966361281067 + "value": 0.2715471491637572 }, { "name": "l1tex__data_pipe_lsu_wavefronts_mem_shared_op_ld.sum", @@ -10448,25 +11230,25 @@ "name": "l1tex__t_sectors_pipe_lsu_mem_local_op_ld.sum", "type": "Double", "unit": "", - "value": 136314880.0 + "value": 0.0 }, { "name": "l1tex__t_sectors_pipe_lsu_mem_local_op_st.sum", "type": "Double", "unit": "", - "value": 136314880.0 + "value": 0.0 }, { "name": "sm__warps_active.avg.pct_of_peak_sustained_active", "type": "Double", "unit": "", - "value": 88.55573293074143 + "value": 98.58441865574966 }, { "name": "smsp__cycles_active.avg.pct_of_peak_sustained_elapsed", "type": "Double", "unit": "", - "value": 99.33535564326992 + "value": 99.91467054025021 }, { "name": "smsp__sass_thread_inst_executed_op_fp16_pred_on.sum", @@ -10478,7 +11260,7 @@ "name": "smsp__sass_thread_inst_executed_op_fp32_pred_on.sum", "type": "Double", "unit": "", - "value": 4647288832.0 + "value": 4831838208.0 }, { "name": "smsp__sass_thread_inst_executed_op_fp64_pred_on.sum", @@ -10490,43 +11272,43 @@ "name": "smsp__sass_thread_inst_executed_op_integer_pred_on.sum", "type": "Double", "unit": "", - "value": 3499098112.0 + "value": 362807296.0 }, { "name": "smsp__sass_thread_inst_executed_op_control_pred_on.sum", "type": "Double", "unit": "", - "value": 1073741824.0 + "value": 69206016.0 }, { "name": "smsp__sass_thread_inst_executed_op_memory_pred_on.sum", "type": "Double", "unit": "", - "value": 2269118464.0 + "value": 150994944.0 }, { "name": "smsp__sass_thread_inst_executed_op_misc_pred_on.sum", "type": "Double", "unit": "", - "value": 213909504.0 + "value": 127926272.0 }, { "name": "smsp__sass_thread_inst_executed_op_conversion_pred_on.sum", "type": "Double", "unit": "", - "value": 3145728.0 + "value": 6291456.0 }, { "name": "smsp__inst_executed.sum", "type": "Double", "unit": "", - "value": 368082944.0 + "value": 173801472.0 }, { "name": "smsp__pipe_fma_cycles_active.avg.pct_of_peak_sustained_active", "type": "Double", "unit": "", - "value": 8.78034561818413 + "value": 42.62301908119608 }, { "name": "smsp__inst_executed_pipe_fp64.avg.pct_of_peak_sustained_active", @@ -10538,13 +11320,13 @@ "name": "smsp__inst_executed_pipe_xu.avg.pct_of_peak_sustained_active", "type": "Double", "unit": "", - "value": 20.05210018827511 + "value": 92.76725137565269 }, { "name": "smsp__inst_executed_pipe_lsu.avg.pct_of_peak_sustained_active", "type": "Double", "unit": "", - "value": 10.608618434568399 + "value": 3.3972382095575937 }, { "name": "smsp__inst_executed_pipe_tex.avg.pct_of_peak_sustained_active", @@ -10556,7 +11338,7 @@ "name": "smsp__issue_active.avg.pct_of_peak_sustained_active", "type": "Double", "unit": "", - "value": 27.4960435574486 + "value": 60.06433547506145 }, { "name": "smsp__thread_inst_executed_per_inst_executed.ratio", @@ -10575,30 +11357,47 @@ "time" ], "times": { - "compilation": 13541.881, - "data": 61375.302, - "framework": 870898.0719999999, - "kernel_overhead": 352384.203, - "profiling_overhead": 51167.101, - "profiling_runs": 405971.466, + "compilation_time": 15880.836, + "data": 56781.868, + "framework": 184345.528, + "kernel_overhead": 29855.577, + "profiling_overhead": 47025.448, + "profiling_runs": 50682.635, "runtimes": [ - 8455.552 + 1859.104 ], - "search_algorithm": 42.072, - "validation": 16.435 + "search_algorithm": 23.846, + "validation": 16.564 }, - "timestamp": "2026-01-27 09:25:15 UTC" + "timestamp": "2026-03-13 09:40:0 UTC" }, { + "compilation_data": { + "constant_memory_size": 0, + "global_size": { + "x": 8, + "y": 128, + "z": 16 + }, + "local_memory_size": 0, + "local_size": { + "x": 32, + "y": 2, + "z": 1 + }, + "max_work_group_size": 1024, + "private_memory_size": 0, + "registers": 40 + }, "configuration": { - "INNER_UNROLL_FACTOR": "1", - "USE_CONSTANT_MEMORY": "0", - "USE_SOA": "0", - "VECTOR_SIZE": "1", - "WORK_GROUP_SIZE_X": "16", - "WORK_GROUP_SIZE_Y": "8", - "WORK_GROUP_SIZE_Z": "1", - "Z_ITERATIONS": "16" + "INNER_UNROLL_FACTOR": 0, + "USE_CONSTANT_MEMORY": 0, + "USE_SOA": 0, + "VECTOR_SIZE": 1, + "WORK_GROUP_SIZE_X": 32, + "WORK_GROUP_SIZE_Y": 2, + "WORK_GROUP_SIZE_Z": 1, + "Z_ITERATIONS": 16 }, "correctness": 1, "invalidity": "correct", @@ -10606,61 +11405,61 @@ { "name": "time", "unit": "", - "value": 8750.976 + "value": 1760.928 }, { "name": "dram__throughput.avg.pct_of_peak_sustained_elapsed", "type": "Double", "unit": "", - "value": 12.122179846204226 + "value": 27.35134240877293 }, { "name": "dram__sectors_read.sum", "type": "Double", "unit": "", - "value": 2101652.0 + "value": 5316.0 }, { "name": "dram__sectors_write.sum", "type": "Double", "unit": "", - "value": 2020696.0 + "value": 1846348.0 }, { "name": "lts__t_sectors.avg.pct_of_peak_sustained_elapsed", "type": "Double", "unit": "", - "value": 54.452670511307275 + "value": 3.4870137636192142 }, { "name": "lts__t_sectors_op_read.sum", "type": "Double", "unit": "", - "value": 20141873.0 + "value": 36022.0 }, { "name": "lts__t_sectors_op_write.sum", "type": "Double", "unit": "", - "value": 138424134.0 + "value": 2100270.0 }, { "name": "l1tex__t_requests_pipe_lsu_mem_global_op_ld.avg.pct_of_peak_sustained_active", "type": "Double", "unit": "", - "value": 0.7853208275720851 + "value": 2.9955779503732844 }, { "name": "l1tex__t_requests_pipe_lsu_mem_global_op_ld.sum", "type": "Double", "unit": "", - "value": 2621440.0 + "value": 2097152.0 }, { "name": "l1tex__data_pipe_lsu_wavefronts_mem_shared.avg.pct_of_peak_sustained_elapsed", "type": "Double", "unit": "", - "value": 0.029459221896794143 + "value": 0.14032899887113268 }, { "name": "l1tex__data_pipe_lsu_wavefronts_mem_shared_op_ld.sum", @@ -10678,25 +11477,25 @@ "name": "l1tex__t_sectors_pipe_lsu_mem_local_op_ld.sum", "type": "Double", "unit": "", - "value": 136314880.0 + "value": 0.0 }, { "name": "l1tex__t_sectors_pipe_lsu_mem_local_op_st.sum", "type": "Double", "unit": "", - "value": 136314880.0 + "value": 0.0 }, { "name": "sm__warps_active.avg.pct_of_peak_sustained_active", "type": "Double", "unit": "", - "value": 89.87277790260781 + "value": 98.43439783629341 }, { "name": "smsp__cycles_active.avg.pct_of_peak_sustained_elapsed", "type": "Double", "unit": "", - "value": 100.4700829980253 + "value": 99.92471505664608 }, { "name": "smsp__sass_thread_inst_executed_op_fp16_pred_on.sum", @@ -10708,7 +11507,7 @@ "name": "smsp__sass_thread_inst_executed_op_fp32_pred_on.sum", "type": "Double", "unit": "", - "value": 4647288832.0 + "value": 4563402752.0 }, { "name": "smsp__sass_thread_inst_executed_op_fp64_pred_on.sum", @@ -10720,25 +11519,25 @@ "name": "smsp__sass_thread_inst_executed_op_integer_pred_on.sum", "type": "Double", "unit": "", - "value": 3499098112.0 + "value": 209715200.0 }, { "name": "smsp__sass_thread_inst_executed_op_control_pred_on.sum", "type": "Double", "unit": "", - "value": 1073741824.0 + "value": 67108864.0 }, { "name": "smsp__sass_thread_inst_executed_op_memory_pred_on.sum", "type": "Double", "unit": "", - "value": 2269118464.0 + "value": 83886080.0 }, { "name": "smsp__sass_thread_inst_executed_op_misc_pred_on.sum", "type": "Double", "unit": "", - "value": 213909504.0 + "value": 160432128.0 }, { "name": "smsp__sass_thread_inst_executed_op_conversion_pred_on.sum", @@ -10750,13 +11549,13 @@ "name": "smsp__inst_executed.sum", "type": "Double", "unit": "", - "value": 368082944.0 + "value": 163381248.0 }, { "name": "smsp__pipe_fma_cycles_active.avg.pct_of_peak_sustained_active", "type": "Double", "unit": "", - "value": 8.765424622710679 + "value": 39.53339082589415 }, { "name": "smsp__inst_executed_pipe_fp64.avg.pct_of_peak_sustained_active", @@ -10768,13 +11567,13 @@ "name": "smsp__inst_executed_pipe_xu.avg.pct_of_peak_sustained_active", "type": "Double", "unit": "", - "value": 20.016733553682247 + "value": 95.87010565075911 }, { "name": "smsp__inst_executed_pipe_lsu.avg.pct_of_peak_sustained_active", "type": "Double", "unit": "", - "value": 10.589907619831404 + "value": 1.942680363528566 }, { "name": "smsp__inst_executed_pipe_tex.avg.pct_of_peak_sustained_active", @@ -10786,7 +11585,7 @@ "name": "smsp__issue_active.avg.pct_of_peak_sustained_active", "type": "Double", "unit": "", - "value": 27.447569089675888 + "value": 58.35187112942539 }, { "name": "smsp__thread_inst_executed_per_inst_executed.ratio", @@ -10805,30 +11604,47 @@ "time" ], "times": { - "compilation": 13166.119, - "data": 63835.1, - "framework": 874107.395, - "kernel_overhead": 350997.405, - "profiling_overhead": 53608.66, - "profiling_runs": 405666.23, + "compilation_time": 16843.16, + "data": 57912.55, + "framework": 193259.279, + "kernel_overhead": 33160.212, + "profiling_overhead": 48339.952, + "profiling_runs": 53846.565, "runtimes": [ - 8750.976 + 1760.928 ], - "search_algorithm": 52.093, - "validation": 16.195 + "search_algorithm": 29.214, + "validation": 16.99 }, - "timestamp": "2026-01-27 09:25:16 UTC" + "timestamp": "2026-03-13 09:40:0 UTC" }, { + "compilation_data": { + "constant_memory_size": 0, + "global_size": { + "x": 8, + "y": 128, + "z": 8 + }, + "local_memory_size": 0, + "local_size": { + "x": 32, + "y": 2, + "z": 1 + }, + "max_work_group_size": 1024, + "private_memory_size": 0, + "registers": 48 + }, "configuration": { - "INNER_UNROLL_FACTOR": "1", - "USE_CONSTANT_MEMORY": "0", - "USE_SOA": "0", - "VECTOR_SIZE": "1", - "WORK_GROUP_SIZE_X": "32", - "WORK_GROUP_SIZE_Y": "2", - "WORK_GROUP_SIZE_Z": "1", - "Z_ITERATIONS": "16" + "INNER_UNROLL_FACTOR": 0, + "USE_CONSTANT_MEMORY": 0, + "USE_SOA": 0, + "VECTOR_SIZE": 1, + "WORK_GROUP_SIZE_X": 32, + "WORK_GROUP_SIZE_Y": 2, + "WORK_GROUP_SIZE_Z": 1, + "Z_ITERATIONS": 32 }, "correctness": 1, "invalidity": "correct", @@ -10836,61 +11652,61 @@ { "name": "time", "unit": "", - "value": 8153.599 + "value": 1750.016 }, { "name": "dram__throughput.avg.pct_of_peak_sustained_elapsed", "type": "Double", "unit": "", - "value": 13.043141583933185 + "value": 27.67334691591673 }, { "name": "dram__sectors_read.sum", "type": "Double", "unit": "", - "value": 2110224.0 + "value": 456.0 }, { "name": "dram__sectors_write.sum", "type": "Double", "unit": "", - "value": 1999672.0 + "value": 1833760.0 }, { "name": "lts__t_sectors.avg.pct_of_peak_sustained_elapsed", "type": "Double", "unit": "", - "value": 54.00006790835147 + "value": 3.525001794193909 }, { "name": "lts__t_sectors_op_read.sum", "type": "Double", "unit": "", - "value": 16676094.0 + "value": 30348.0 }, { "name": "lts__t_sectors_op_write.sum", "type": "Double", "unit": "", - "value": 138417558.0 + "value": 2099076.0 }, { "name": "l1tex__t_requests_pipe_lsu_mem_global_op_ld.avg.pct_of_peak_sustained_active", "type": "Double", "unit": "", - "value": 0.7861217554769574 + "value": 1.4942277541359679 }, { "name": "l1tex__t_requests_pipe_lsu_mem_global_op_ld.sum", "type": "Double", "unit": "", - "value": 2621440.0 + "value": 1048576.0 }, { "name": "l1tex__data_pipe_lsu_wavefronts_mem_shared.avg.pct_of_peak_sustained_elapsed", "type": "Double", "unit": "", - "value": 0.029448851373770464 + "value": 0.06999991910816639 }, { "name": "l1tex__data_pipe_lsu_wavefronts_mem_shared_op_ld.sum", @@ -10908,25 +11724,25 @@ "name": "l1tex__t_sectors_pipe_lsu_mem_local_op_ld.sum", "type": "Double", "unit": "", - "value": 136314880.0 + "value": 0.0 }, { "name": "l1tex__t_sectors_pipe_lsu_mem_local_op_st.sum", "type": "Double", "unit": "", - "value": 136314880.0 + "value": 0.0 }, { "name": "sm__warps_active.avg.pct_of_peak_sustained_active", "type": "Double", "unit": "", - "value": 88.98535185292054 + "value": 81.74092345350775 }, { "name": "smsp__cycles_active.avg.pct_of_peak_sustained_elapsed", "type": "Double", "unit": "", - "value": 99.50594791665793 + "value": 99.92786705723371 }, { "name": "smsp__sass_thread_inst_executed_op_fp16_pred_on.sum", @@ -10938,7 +11754,7 @@ "name": "smsp__sass_thread_inst_executed_op_fp32_pred_on.sum", "type": "Double", "unit": "", - "value": 4647288832.0 + "value": 4429185024.0 }, { "name": "smsp__sass_thread_inst_executed_op_fp64_pred_on.sum", @@ -10950,43 +11766,43 @@ "name": "smsp__sass_thread_inst_executed_op_integer_pred_on.sum", "type": "Double", "unit": "", - "value": 3499098112.0 + "value": 138412032.0 }, { "name": "smsp__sass_thread_inst_executed_op_control_pred_on.sum", "type": "Double", "unit": "", - "value": 1073741824.0 + "value": 33554432.0 }, { "name": "smsp__sass_thread_inst_executed_op_memory_pred_on.sum", "type": "Double", "unit": "", - "value": 2269118464.0 + "value": 50331648.0 }, { "name": "smsp__sass_thread_inst_executed_op_misc_pred_on.sum", "type": "Double", "unit": "", - "value": 213909504.0 + "value": 82837504.0 }, { "name": "smsp__sass_thread_inst_executed_op_conversion_pred_on.sum", "type": "Double", "unit": "", - "value": 3145728.0 + "value": 1572864.0 }, { "name": "smsp__inst_executed.sum", "type": "Double", "unit": "", - "value": 368082944.0 + "value": 150192128.0 }, { "name": "smsp__pipe_fma_cycles_active.avg.pct_of_peak_sustained_active", "type": "Double", "unit": "", - "value": 8.846354361996726 + "value": 37.93282632720237 }, { "name": "smsp__inst_executed_pipe_fp64.avg.pct_of_peak_sustained_active", @@ -10998,13 +11814,13 @@ "name": "smsp__inst_executed_pipe_xu.avg.pct_of_peak_sustained_active", "type": "Double", "unit": "", - "value": 20.203565339965447 + "value": 95.64221243137006 }, { "name": "smsp__inst_executed_pipe_lsu.avg.pct_of_peak_sustained_active", "type": "Double", "unit": "", - "value": 10.688751487232695 + "value": 1.155832401209184 }, { "name": "smsp__inst_executed_pipe_tex.avg.pct_of_peak_sustained_active", @@ -11016,7 +11832,7 @@ "name": "smsp__issue_active.avg.pct_of_peak_sustained_active", "type": "Double", "unit": "", - "value": 27.703738488896423 + "value": 53.51388235973342 }, { "name": "smsp__thread_inst_executed_per_inst_executed.ratio", @@ -11035,30 +11851,47 @@ "time" ], "times": { - "compilation": 13022.808, - "data": 62257.937, - "framework": 869658.561, - "kernel_overhead": 351405.027, - "profiling_overhead": 52256.104, - "profiling_runs": 403739.493, + "compilation_time": 17923.512, + "data": 58516.471, + "framework": 175269.54299999998, + "kernel_overhead": 23853.835, + "profiling_overhead": 48588.41, + "profiling_runs": 44310.827, "runtimes": [ - 8153.599 + 1750.016 ], - "search_algorithm": 24.231, - "validation": 16.712 + "search_algorithm": 34.279, + "validation": 16.945 }, - "timestamp": "2026-01-27 09:25:16 UTC" + "timestamp": "2026-03-13 09:40:0 UTC" }, { + "compilation_data": { + "constant_memory_size": 0, + "global_size": { + "x": 8, + "y": 128, + "z": 128 + }, + "local_memory_size": 0, + "local_size": { + "x": 32, + "y": 2, + "z": 1 + }, + "max_work_group_size": 1024, + "private_memory_size": 0, + "registers": 19 + }, "configuration": { - "INNER_UNROLL_FACTOR": "1", - "USE_CONSTANT_MEMORY": "0", - "USE_SOA": "0", - "VECTOR_SIZE": "1", - "WORK_GROUP_SIZE_X": "32", - "WORK_GROUP_SIZE_Y": "4", - "WORK_GROUP_SIZE_Z": "1", - "Z_ITERATIONS": "16" + "INNER_UNROLL_FACTOR": 1, + "USE_CONSTANT_MEMORY": 0, + "USE_SOA": 0, + "VECTOR_SIZE": 1, + "WORK_GROUP_SIZE_X": 32, + "WORK_GROUP_SIZE_Y": 2, + "WORK_GROUP_SIZE_Z": 1, + "Z_ITERATIONS": 2 }, "correctness": 1, "invalidity": "correct", @@ -11066,61 +11899,61 @@ { "name": "time", "unit": "", - "value": 8434.784 + "value": 5159.68 }, { "name": "dram__throughput.avg.pct_of_peak_sustained_elapsed", "type": "Double", "unit": "", - "value": 12.935391472649185 + "value": 9.262054929325462 }, { "name": "dram__sectors_read.sum", "type": "Double", "unit": "", - "value": 2103256.0 + "value": 14396.0 }, { "name": "dram__sectors_write.sum", "type": "Double", "unit": "", - "value": 1998716.0 + "value": 1840884.0 }, { "name": "lts__t_sectors.avg.pct_of_peak_sustained_elapsed", "type": "Double", "unit": "", - "value": 54.23330589891538 + "value": 1.2089685352549122 }, { "name": "lts__t_sectors_op_read.sum", "type": "Double", "unit": "", - "value": 17275965.0 + "value": 103384.0 }, { "name": "lts__t_sectors_op_write.sum", "type": "Double", "unit": "", - "value": 138419030.0 + "value": 2102367.0 }, { "name": "l1tex__t_requests_pipe_lsu_mem_global_op_ld.avg.pct_of_peak_sustained_active", "type": "Double", "unit": "", - "value": 0.7897245646693194 + "value": 7.954741635171987 }, { "name": "l1tex__t_requests_pipe_lsu_mem_global_op_ld.sum", "type": "Double", "unit": "", - "value": 2621440.0 + "value": 16777216.0 }, { "name": "l1tex__data_pipe_lsu_wavefronts_mem_shared.avg.pct_of_peak_sustained_elapsed", "type": "Double", "unit": "", - "value": 0.02943297756748656 + "value": 0.3727681886884133 }, { "name": "l1tex__data_pipe_lsu_wavefronts_mem_shared_op_ld.sum", @@ -11138,25 +11971,25 @@ "name": "l1tex__t_sectors_pipe_lsu_mem_local_op_ld.sum", "type": "Double", "unit": "", - "value": 136314880.0 + "value": 0.0 }, { "name": "l1tex__t_sectors_pipe_lsu_mem_local_op_st.sum", "type": "Double", "unit": "", - "value": 136314880.0 + "value": 0.0 }, { "name": "sm__warps_active.avg.pct_of_peak_sustained_active", "type": "Double", "unit": "", - "value": 90.03306672492435 + "value": 97.81618024698192 }, { "name": "smsp__cycles_active.avg.pct_of_peak_sustained_elapsed", "type": "Double", "unit": "", - "value": 99.06716168106088 + "value": 99.96402144141355 }, { "name": "smsp__sass_thread_inst_executed_op_fp16_pred_on.sum", @@ -11168,7 +12001,7 @@ "name": "smsp__sass_thread_inst_executed_op_fp32_pred_on.sum", "type": "Double", "unit": "", - "value": 4647288832.0 + "value": 6979321856.0 }, { "name": "smsp__sass_thread_inst_executed_op_fp64_pred_on.sum", @@ -11180,7 +12013,7 @@ "name": "smsp__sass_thread_inst_executed_op_integer_pred_on.sum", "type": "Double", "unit": "", - "value": 3499098112.0 + "value": 8212447232.0 }, { "name": "smsp__sass_thread_inst_executed_op_control_pred_on.sum", @@ -11192,31 +12025,31 @@ "name": "smsp__sass_thread_inst_executed_op_memory_pred_on.sum", "type": "Double", "unit": "", - "value": 2269118464.0 + "value": 553648128.0 }, { "name": "smsp__sass_thread_inst_executed_op_misc_pred_on.sum", "type": "Double", "unit": "", - "value": 213909504.0 + "value": 3321888768.0 }, { "name": "smsp__sass_thread_inst_executed_op_conversion_pred_on.sum", "type": "Double", "unit": "", - "value": 3145728.0 + "value": 25165824.0 }, { "name": "smsp__inst_executed.sum", "type": "Double", "unit": "", - "value": 368082944.0 + "value": 715390976.0 }, { "name": "smsp__pipe_fma_cycles_active.avg.pct_of_peak_sustained_active", "type": "Double", "unit": "", - "value": 8.881340760663099 + "value": 36.26494213110673 }, { "name": "smsp__inst_executed_pipe_fp64.avg.pct_of_peak_sustained_active", @@ -11228,13 +12061,13 @@ "name": "smsp__inst_executed_pipe_xu.avg.pct_of_peak_sustained_active", "type": "Double", "unit": "", - "value": 20.28211199868471 + "value": 31.821000838840863 }, { "name": "smsp__inst_executed_pipe_lsu.avg.pct_of_peak_sustained_active", "type": "Double", "unit": "", - "value": 10.730306811804141 + "value": 4.288377066171913 }, { "name": "smsp__inst_executed_pipe_tex.avg.pct_of_peak_sustained_active", @@ -11246,7 +12079,7 @@ "name": "smsp__issue_active.avg.pct_of_peak_sustained_active", "type": "Double", "unit": "", - "value": 27.811464286890814 + "value": 84.80466710601902 }, { "name": "smsp__thread_inst_executed_per_inst_executed.ratio", @@ -11265,30 +12098,47 @@ "time" ], "times": { - "compilation": 13442.369, - "data": 64343.311, - "framework": 872577.013, - "kernel_overhead": 350758.324, - "profiling_overhead": 53655.935, - "profiling_runs": 403819.443, + "compilation_time": 17160.528, + "data": 58178.988, + "framework": 2091507.7329999998, + "kernel_overhead": 973699.193, + "profiling_overhead": 48076.224, + "profiling_runs": 1011553.328, "runtimes": [ - 8434.784 + 5159.68 ], - "search_algorithm": 22.346, - "validation": 12.671 + "search_algorithm": 24.491, + "validation": 17.378 }, - "timestamp": "2026-01-27 09:25:17 UTC" + "timestamp": "2026-03-13 09:40:1 UTC" }, { + "compilation_data": { + "constant_memory_size": 0, + "global_size": { + "x": 8, + "y": 128, + "z": 64 + }, + "local_memory_size": 0, + "local_size": { + "x": 32, + "y": 2, + "z": 1 + }, + "max_work_group_size": 1024, + "private_memory_size": 0, + "registers": 22 + }, "configuration": { - "INNER_UNROLL_FACTOR": "1", - "USE_CONSTANT_MEMORY": "0", - "USE_SOA": "0", - "VECTOR_SIZE": "1", - "WORK_GROUP_SIZE_X": "32", - "WORK_GROUP_SIZE_Y": "8", - "WORK_GROUP_SIZE_Z": "1", - "Z_ITERATIONS": "16" + "INNER_UNROLL_FACTOR": 1, + "USE_CONSTANT_MEMORY": 0, + "USE_SOA": 0, + "VECTOR_SIZE": 1, + "WORK_GROUP_SIZE_X": 32, + "WORK_GROUP_SIZE_Y": 2, + "WORK_GROUP_SIZE_Z": 1, + "Z_ITERATIONS": 4 }, "correctness": 1, "invalidity": "correct", @@ -11296,61 +12146,61 @@ { "name": "time", "unit": "", - "value": 8152.672 + "value": 5877.28 }, { "name": "dram__throughput.avg.pct_of_peak_sustained_elapsed", "type": "Double", "unit": "", - "value": 13.0121243405383 + "value": 8.355767902570419 }, { "name": "dram__sectors_read.sum", "type": "Double", "unit": "", - "value": 2098848.0 + "value": 16764.0 }, { "name": "dram__sectors_write.sum", "type": "Double", "unit": "", - "value": 1998976.0 + "value": 1841516.0 }, { "name": "lts__t_sectors.avg.pct_of_peak_sustained_elapsed", "type": "Double", "unit": "", - "value": 52.23717955696912 + "value": 1.1036310142077388 }, { "name": "lts__t_sectors_op_read.sum", "type": "Double", "unit": "", - "value": 10515812.0 + "value": 112057.0 }, { "name": "lts__t_sectors_op_write.sum", "type": "Double", "unit": "", - "value": 138414975.0 + "value": 2106333.0 }, { "name": "l1tex__t_requests_pipe_lsu_mem_global_op_ld.avg.pct_of_peak_sustained_active", "type": "Double", "unit": "", - "value": 0.7955638478044696 + "value": 3.624655450980168 }, { "name": "l1tex__t_requests_pipe_lsu_mem_global_op_ld.sum", "type": "Double", "unit": "", - "value": 2621440.0 + "value": 8388608.0 }, { "name": "l1tex__data_pipe_lsu_wavefronts_mem_shared.avg.pct_of_peak_sustained_elapsed", "type": "Double", "unit": "", - "value": 0.029637607171316877 + "value": 0.1698626501227523 }, { "name": "l1tex__data_pipe_lsu_wavefronts_mem_shared_op_ld.sum", @@ -11368,25 +12218,25 @@ "name": "l1tex__t_sectors_pipe_lsu_mem_local_op_ld.sum", "type": "Double", "unit": "", - "value": 136314880.0 + "value": 0.0 }, { "name": "l1tex__t_sectors_pipe_lsu_mem_local_op_st.sum", "type": "Double", "unit": "", - "value": 136314880.0 + "value": 0.0 }, { "name": "sm__warps_active.avg.pct_of_peak_sustained_active", "type": "Double", "unit": "", - "value": 85.447162031492 + "value": 98.82166269113387 }, { "name": "smsp__cycles_active.avg.pct_of_peak_sustained_elapsed", "type": "Double", "unit": "", - "value": 99.27520216246478 + "value": 99.96868014833953 }, { "name": "smsp__sass_thread_inst_executed_op_fp16_pred_on.sum", @@ -11398,7 +12248,7 @@ "name": "smsp__sass_thread_inst_executed_op_fp32_pred_on.sum", "type": "Double", "unit": "", - "value": 4647288832.0 + "value": 5637144576.0 }, { "name": "smsp__sass_thread_inst_executed_op_fp64_pred_on.sum", @@ -11410,7 +12260,7 @@ "name": "smsp__sass_thread_inst_executed_op_integer_pred_on.sum", "type": "Double", "unit": "", - "value": 3499098112.0 + "value": 10049552384.0 }, { "name": "smsp__sass_thread_inst_executed_op_control_pred_on.sum", @@ -11422,31 +12272,31 @@ "name": "smsp__sass_thread_inst_executed_op_memory_pred_on.sum", "type": "Double", "unit": "", - "value": 2269118464.0 + "value": 285212672.0 }, { "name": "smsp__sass_thread_inst_executed_op_misc_pred_on.sum", "type": "Double", "unit": "", - "value": 213909504.0 + "value": 1392508928.0 }, { "name": "smsp__sass_thread_inst_executed_op_conversion_pred_on.sum", "type": "Double", "unit": "", - "value": 3145728.0 + "value": 12582912.0 }, { "name": "smsp__inst_executed.sum", "type": "Double", "unit": "", - "value": 368082944.0 + "value": 786956288.0 }, { "name": "smsp__pipe_fma_cycles_active.avg.pct_of_peak_sustained_active", "type": "Double", "unit": "", - "value": 8.924155834174288 + "value": 33.827445703972344 }, { "name": "smsp__inst_executed_pipe_fp64.avg.pct_of_peak_sustained_active", @@ -11458,13 +12308,13 @@ "name": "smsp__inst_executed_pipe_xu.avg.pct_of_peak_sustained_active", "type": "Double", "unit": "", - "value": 20.380322633348193 + "value": 28.998974723482846 }, { "name": "smsp__inst_executed_pipe_lsu.avg.pct_of_peak_sustained_active", "type": "Double", "unit": "", - "value": 10.782265416617562 + "value": 2.0106710989914864 }, { "name": "smsp__inst_executed_pipe_tex.avg.pct_of_peak_sustained_active", @@ -11476,7 +12326,7 @@ "name": "smsp__issue_active.avg.pct_of_peak_sustained_active", "type": "Double", "unit": "", - "value": 27.94613392279735 + "value": 85.01496766301393 }, { "name": "smsp__thread_inst_executed_per_inst_executed.ratio", @@ -11495,30 +12345,47 @@ "time" ], "times": { - "compilation": 13559.499, - "data": 65325.797, - "framework": 875179.474, - "kernel_overhead": 351249.128, - "profiling_overhead": 54970.814, - "profiling_runs": 403633.735, + "compilation_time": 15193.866, + "data": 58155.642, + "framework": 2676112.106, + "kernel_overhead": 1264764.749, + "profiling_overhead": 48054.662, + "profiling_runs": 1305137.053, "runtimes": [ - 8152.672 + 5877.28 ], - "search_algorithm": 21.202, - "validation": 14.168 + "search_algorithm": 25.863, + "validation": 18.713 }, - "timestamp": "2026-01-27 09:25:17 UTC" + "timestamp": "2026-03-13 09:40:3 UTC" }, { + "compilation_data": { + "constant_memory_size": 0, + "global_size": { + "x": 8, + "y": 128, + "z": 32 + }, + "local_memory_size": 0, + "local_size": { + "x": 32, + "y": 2, + "z": 1 + }, + "max_work_group_size": 1024, + "private_memory_size": 32, + "registers": 27 + }, "configuration": { - "INNER_UNROLL_FACTOR": "1", - "USE_CONSTANT_MEMORY": "0", - "USE_SOA": "0", - "VECTOR_SIZE": "1", - "WORK_GROUP_SIZE_X": "16", - "WORK_GROUP_SIZE_Y": "4", - "WORK_GROUP_SIZE_Z": "1", - "Z_ITERATIONS": "32" + "INNER_UNROLL_FACTOR": 1, + "USE_CONSTANT_MEMORY": 0, + "USE_SOA": 0, + "VECTOR_SIZE": 1, + "WORK_GROUP_SIZE_X": 32, + "WORK_GROUP_SIZE_Y": 2, + "WORK_GROUP_SIZE_Z": 1, + "Z_ITERATIONS": 8 }, "correctness": 1, "invalidity": "correct", @@ -11526,61 +12393,61 @@ { "name": "time", "unit": "", - "value": 9774.304 + "value": 8202.783 }, { "name": "dram__throughput.avg.pct_of_peak_sustained_elapsed", "type": "Double", "unit": "", - "value": 11.612756346823176 + "value": 6.107804881036976 }, { "name": "dram__sectors_read.sum", "type": "Double", "unit": "", - "value": 2117372.0 + "value": 5400.0 }, { "name": "dram__sectors_write.sum", "type": "Double", "unit": "", - "value": 2238768.0 + "value": 1908960.0 }, { "name": "lts__t_sectors.avg.pct_of_peak_sustained_elapsed", "type": "Double", "unit": "", - "value": 75.34743063764462 + "value": 48.82341327257352 }, { "name": "lts__t_sectors_op_read.sum", "type": "Double", "unit": "", - "value": 119207773.0 + "value": 599763.0 }, { "name": "lts__t_sectors_op_write.sum", "type": "Double", "unit": "", - "value": 138418833.0 + "value": 138423999.0 }, { "name": "l1tex__t_requests_pipe_lsu_mem_global_op_ld.avg.pct_of_peak_sustained_active", "type": "Double", "unit": "", - "value": 0.4000727671414845 + "value": 1.2771077920393916 }, { "name": "l1tex__t_requests_pipe_lsu_mem_global_op_ld.sum", "type": "Double", "unit": "", - "value": 1572864.0 + "value": 4194304.0 }, { "name": "l1tex__data_pipe_lsu_wavefronts_mem_shared.avg.pct_of_peak_sustained_elapsed", "type": "Double", "unit": "", - "value": 0.012368593756920091 + "value": 0.05965367806699363 }, { "name": "l1tex__data_pipe_lsu_wavefronts_mem_shared_op_ld.sum", @@ -11610,13 +12477,13 @@ "name": "sm__warps_active.avg.pct_of_peak_sustained_active", "type": "Double", "unit": "", - "value": 93.73171676182082 + "value": 91.20397001426353 }, { "name": "smsp__cycles_active.avg.pct_of_peak_sustained_elapsed", "type": "Double", "unit": "", - "value": 97.79079513359048 + "value": 99.72918536962962 }, { "name": "smsp__sass_thread_inst_executed_op_fp16_pred_on.sum", @@ -11628,7 +12495,7 @@ "name": "smsp__sass_thread_inst_executed_op_fp32_pred_on.sum", "type": "Double", "unit": "", - "value": 4479516672.0 + "value": 4966055936.0 }, { "name": "smsp__sass_thread_inst_executed_op_fp64_pred_on.sum", @@ -11640,7 +12507,7 @@ "name": "smsp__sass_thread_inst_executed_op_integer_pred_on.sum", "type": "Double", "unit": "", - "value": 3393716224.0 + "value": 3982491648.0 }, { "name": "smsp__sass_thread_inst_executed_op_control_pred_on.sum", @@ -11652,31 +12519,31 @@ "name": "smsp__sass_thread_inst_executed_op_memory_pred_on.sum", "type": "Double", "unit": "", - "value": 2235564032.0 + "value": 2319450112.0 }, { "name": "smsp__sass_thread_inst_executed_op_misc_pred_on.sum", "type": "Double", "unit": "", - "value": 106954752.0 + "value": 155189248.0 }, { "name": "smsp__sass_thread_inst_executed_op_conversion_pred_on.sum", "type": "Double", "unit": "", - "value": 1572864.0 + "value": 6291456.0 }, { "name": "smsp__inst_executed.sum", "type": "Double", "unit": "", - "value": 353976320.0 + "value": 395247616.0 }, { "name": "smsp__pipe_fma_cycles_active.avg.pct_of_peak_sustained_active", "type": "Double", "unit": "", - "value": 7.08579249655818 + "value": 10.675083834799617 }, { "name": "smsp__inst_executed_pipe_fp64.avg.pct_of_peak_sustained_active", @@ -11688,13 +12555,13 @@ "name": "smsp__inst_executed_pipe_xu.avg.pct_of_peak_sustained_active", "type": "Double", "unit": "", - "value": 17.268755530325887 + "value": 20.417081223247653 }, { "name": "smsp__inst_executed_pipe_lsu.avg.pct_of_peak_sustained_active", "type": "Double", "unit": "", - "value": 8.994846172839424 + "value": 11.055929236612133 }, { "name": "smsp__inst_executed_pipe_tex.avg.pct_of_peak_sustained_active", @@ -11706,7 +12573,7 @@ "name": "smsp__issue_active.avg.pct_of_peak_sustained_active", "type": "Double", "unit": "", - "value": 22.771946388020933 + "value": 30.062666280875366 }, { "name": "smsp__thread_inst_executed_per_inst_executed.ratio", @@ -11725,30 +12592,47 @@ "time" ], "times": { - "compilation": 14201.861, - "data": 61217.71, - "framework": 854834.784, - "kernel_overhead": 340739.437, - "profiling_overhead": 50833.033, - "profiling_runs": 402044.604, + "compilation_time": 14496.293, + "data": 58159.956, + "framework": 906075.623, + "kernel_overhead": 373569.689, + "profiling_overhead": 48008.939, + "profiling_runs": 426337.039, "runtimes": [ - 9774.304 + 8202.783 ], - "search_algorithm": 23.84, - "validation": 16.791 + "search_algorithm": 26.437, + "validation": 15.447 }, - "timestamp": "2026-01-27 09:25:18 UTC" + "timestamp": "2026-03-13 09:40:3 UTC" }, { + "compilation_data": { + "constant_memory_size": 0, + "global_size": { + "x": 8, + "y": 128, + "z": 16 + }, + "local_memory_size": 0, + "local_size": { + "x": 32, + "y": 2, + "z": 1 + }, + "max_work_group_size": 1024, + "private_memory_size": 64, + "registers": 32 + }, "configuration": { - "INNER_UNROLL_FACTOR": "1", - "USE_CONSTANT_MEMORY": "0", - "USE_SOA": "0", - "VECTOR_SIZE": "1", - "WORK_GROUP_SIZE_X": "16", - "WORK_GROUP_SIZE_Y": "8", - "WORK_GROUP_SIZE_Z": "1", - "Z_ITERATIONS": "32" + "INNER_UNROLL_FACTOR": 1, + "USE_CONSTANT_MEMORY": 0, + "USE_SOA": 0, + "VECTOR_SIZE": 1, + "WORK_GROUP_SIZE_X": 32, + "WORK_GROUP_SIZE_Y": 2, + "WORK_GROUP_SIZE_Z": 1, + "Z_ITERATIONS": 16 }, "correctness": 1, "invalidity": "correct", @@ -11756,61 +12640,61 @@ { "name": "time", "unit": "", - "value": 9996.544 + "value": 8950.4 }, { "name": "dram__throughput.avg.pct_of_peak_sustained_elapsed", "type": "Double", "unit": "", - "value": 11.438098011621879 + "value": 6.339470860011641 }, { "name": "dram__sectors_read.sum", "type": "Double", "unit": "", - "value": 2117660.0 + "value": 5892.0 }, { "name": "dram__sectors_write.sum", "type": "Double", "unit": "", - "value": 2231028.0 + "value": 2001576.0 }, { "name": "lts__t_sectors.avg.pct_of_peak_sustained_elapsed", "type": "Double", "unit": "", - "value": 76.89619322483003 + "value": 53.56458169832884 }, { "name": "lts__t_sectors_op_read.sum", "type": "Double", "unit": "", - "value": 126036574.0 + "value": 15109514.0 }, { "name": "lts__t_sectors_op_write.sum", "type": "Double", "unit": "", - "value": 138425775.0 + "value": 138419528.0 }, { "name": "l1tex__t_requests_pipe_lsu_mem_global_op_ld.avg.pct_of_peak_sustained_active", "type": "Double", "unit": "", - "value": 0.3987601760645192 + "value": 0.6372941119808453 }, { "name": "l1tex__t_requests_pipe_lsu_mem_global_op_ld.sum", "type": "Double", "unit": "", - "value": 1572864.0 + "value": 2097152.0 }, { "name": "l1tex__data_pipe_lsu_wavefronts_mem_shared.avg.pct_of_peak_sustained_elapsed", "type": "Double", "unit": "", - "value": 0.012295247310574745 + "value": 0.02982395058854853 }, { "name": "l1tex__data_pipe_lsu_wavefronts_mem_shared_op_ld.sum", @@ -11840,13 +12724,13 @@ "name": "sm__warps_active.avg.pct_of_peak_sustained_active", "type": "Double", "unit": "", - "value": 95.49998387877464 + "value": 88.86789069958499 }, { "name": "smsp__cycles_active.avg.pct_of_peak_sustained_elapsed", "type": "Double", "unit": "", - "value": 98.45157648474316 + "value": 99.20939704129323 }, { "name": "smsp__sass_thread_inst_executed_op_fp16_pred_on.sum", @@ -11858,7 +12742,7 @@ "name": "smsp__sass_thread_inst_executed_op_fp32_pred_on.sum", "type": "Double", "unit": "", - "value": 4479516672.0 + "value": 4630511616.0 }, { "name": "smsp__sass_thread_inst_executed_op_fp64_pred_on.sum", @@ -11870,7 +12754,7 @@ "name": "smsp__sass_thread_inst_executed_op_integer_pred_on.sum", "type": "Double", "unit": "", - "value": 3393716224.0 + "value": 3635412992.0 }, { "name": "smsp__sass_thread_inst_executed_op_control_pred_on.sum", @@ -11882,31 +12766,31 @@ "name": "smsp__sass_thread_inst_executed_op_memory_pred_on.sum", "type": "Double", "unit": "", - "value": 2235564032.0 + "value": 2252341248.0 }, { "name": "smsp__sass_thread_inst_executed_op_misc_pred_on.sum", "type": "Double", "unit": "", - "value": 106954752.0 + "value": 77594624.0 }, { "name": "smsp__sass_thread_inst_executed_op_conversion_pred_on.sum", "type": "Double", "unit": "", - "value": 1572864.0 + "value": 3145728.0 }, { "name": "smsp__inst_executed.sum", "type": "Double", "unit": "", - "value": 353976320.0 + "value": 367034368.0 }, { "name": "smsp__pipe_fma_cycles_active.avg.pct_of_peak_sustained_active", "type": "Double", "unit": "", - "value": 6.994033868671147 + "value": 9.274760323601196 }, { "name": "smsp__inst_executed_pipe_fp64.avg.pct_of_peak_sustained_active", @@ -11918,13 +12802,13 @@ "name": "smsp__inst_executed_pipe_xu.avg.pct_of_peak_sustained_active", "type": "Double", "unit": "", - "value": 17.05113477518177 + "value": 20.522064988099437 }, { "name": "smsp__inst_executed_pipe_lsu.avg.pct_of_peak_sustained_active", "type": "Double", "unit": "", - "value": 8.881493174524 + "value": 10.777090280615695 }, { "name": "smsp__inst_executed_pipe_tex.avg.pct_of_peak_sustained_active", @@ -11936,7 +12820,7 @@ "name": "smsp__issue_active.avg.pct_of_peak_sustained_active", "type": "Double", "unit": "", - "value": 22.48498880982553 + "value": 28.060334812913588 }, { "name": "smsp__thread_inst_executed_per_inst_executed.ratio", @@ -11955,30 +12839,47 @@ "time" ], "times": { - "compilation": 14077.196, - "data": 62236.684, - "framework": 863000.2660000001, - "kernel_overhead": 343024.501, - "profiling_overhead": 52066.906, - "profiling_runs": 405672.175, + "compilation_time": 14709.784, + "data": 58612.538, + "framework": 871709.282, + "kernel_overhead": 355534.581, + "profiling_overhead": 48524.934, + "profiling_runs": 409037.229, "runtimes": [ - 9996.544 + 8950.4 ], - "search_algorithm": 21.145, - "validation": 16.394 + "search_algorithm": 25.598, + "validation": 15.716 }, - "timestamp": "2026-01-27 09:25:18 UTC" + "timestamp": "2026-03-13 09:40:4 UTC" }, { + "compilation_data": { + "constant_memory_size": 0, + "global_size": { + "x": 8, + "y": 128, + "z": 8 + }, + "local_memory_size": 0, + "local_size": { + "x": 32, + "y": 2, + "z": 1 + }, + "max_work_group_size": 1024, + "private_memory_size": 128, + "registers": 32 + }, "configuration": { - "INNER_UNROLL_FACTOR": "1", - "USE_CONSTANT_MEMORY": "0", - "USE_SOA": "0", - "VECTOR_SIZE": "1", - "WORK_GROUP_SIZE_X": "32", - "WORK_GROUP_SIZE_Y": "2", - "WORK_GROUP_SIZE_Z": "1", - "Z_ITERATIONS": "32" + "INNER_UNROLL_FACTOR": 1, + "USE_CONSTANT_MEMORY": 0, + "USE_SOA": 0, + "VECTOR_SIZE": 1, + "WORK_GROUP_SIZE_X": 32, + "WORK_GROUP_SIZE_Y": 2, + "WORK_GROUP_SIZE_Z": 1, + "Z_ITERATIONS": 32 }, "correctness": 1, "invalidity": "correct", @@ -11986,61 +12887,61 @@ { "name": "time", "unit": "", - "value": 9523.776 + "value": 9336.512 }, { "name": "dram__throughput.avg.pct_of_peak_sustained_elapsed", "type": "Double", "unit": "", - "value": 11.351738699431401 + "value": 6.125476732955249 }, { "name": "dram__sectors_read.sum", "type": "Double", "unit": "", - "value": 2100760.0 + "value": 19928.0 }, { "name": "dram__sectors_write.sum", "type": "Double", "unit": "", - "value": 2171940.0 + "value": 2196244.0 }, { "name": "lts__t_sectors.avg.pct_of_peak_sustained_elapsed", "type": "Double", "unit": "", - "value": 76.11292013196372 + "value": 74.00556621893195 }, { "name": "lts__t_sectors_op_read.sum", "type": "Double", "unit": "", - "value": 115629748.0 + "value": 103813504.0 }, { "name": "lts__t_sectors_op_write.sum", "type": "Double", "unit": "", - "value": 138415100.0 + "value": 138425220.0 }, { "name": "l1tex__t_requests_pipe_lsu_mem_global_op_ld.avg.pct_of_peak_sustained_active", "type": "Double", "unit": "", - "value": 0.4130193766636857 + "value": 0.2843989254953286 }, { "name": "l1tex__t_requests_pipe_lsu_mem_global_op_ld.sum", "type": "Double", "unit": "", - "value": 1572864.0 + "value": 1048576.0 }, { "name": "l1tex__data_pipe_lsu_wavefronts_mem_shared.avg.pct_of_peak_sustained_elapsed", "type": "Double", "unit": "", - "value": 0.012695766959774816 + "value": 0.013000637484058403 }, { "name": "l1tex__data_pipe_lsu_wavefronts_mem_shared_op_ld.sum", @@ -12070,13 +12971,13 @@ "name": "sm__warps_active.avg.pct_of_peak_sustained_active", "type": "Double", "unit": "", - "value": 92.40843014643502 + "value": 86.84572628252825 }, { "name": "smsp__cycles_active.avg.pct_of_peak_sustained_elapsed", "type": "Double", "unit": "", - "value": 98.94098055334575 + "value": 98.69974442395062 }, { "name": "smsp__sass_thread_inst_executed_op_fp16_pred_on.sum", @@ -12088,7 +12989,7 @@ "name": "smsp__sass_thread_inst_executed_op_fp32_pred_on.sum", "type": "Double", "unit": "", - "value": 4479516672.0 + "value": 4462739456.0 }, { "name": "smsp__sass_thread_inst_executed_op_fp64_pred_on.sum", @@ -12100,7 +13001,7 @@ "name": "smsp__sass_thread_inst_executed_op_integer_pred_on.sum", "type": "Double", "unit": "", - "value": 3393716224.0 + "value": 3461873664.0 }, { "name": "smsp__sass_thread_inst_executed_op_control_pred_on.sum", @@ -12112,13 +13013,13 @@ "name": "smsp__sass_thread_inst_executed_op_memory_pred_on.sum", "type": "Double", "unit": "", - "value": 2235564032.0 + "value": 2218786816.0 }, { "name": "smsp__sass_thread_inst_executed_op_misc_pred_on.sum", "type": "Double", "unit": "", - "value": 106954752.0 + "value": 38797312.0 }, { "name": "smsp__sass_thread_inst_executed_op_conversion_pred_on.sum", @@ -12130,13 +13031,13 @@ "name": "smsp__inst_executed.sum", "type": "Double", "unit": "", - "value": 353976320.0 + "value": 352927744.0 }, { "name": "smsp__pipe_fma_cycles_active.avg.pct_of_peak_sustained_active", "type": "Double", "unit": "", - "value": 7.188289540631859 + "value": 7.489273183713623 }, { "name": "smsp__inst_executed_pipe_fp64.avg.pct_of_peak_sustained_active", @@ -12148,13 +13049,13 @@ "name": "smsp__inst_executed_pipe_xu.avg.pct_of_peak_sustained_active", "type": "Double", "unit": "", - "value": 17.519488613786926 + "value": 17.984042223376264 }, { "name": "smsp__inst_executed_pipe_lsu.avg.pct_of_peak_sustained_active", "type": "Double", "unit": "", - "value": 9.125446522830666 + "value": 9.297170265624814 }, { "name": "smsp__inst_executed_pipe_tex.avg.pct_of_peak_sustained_active", @@ -12166,7 +13067,7 @@ "name": "smsp__issue_active.avg.pct_of_peak_sustained_active", "type": "Double", "unit": "", - "value": 23.102582725431247 + "value": 23.644940663651642 }, { "name": "smsp__thread_inst_executed_per_inst_executed.ratio", @@ -12185,30 +13086,47 @@ "time" ], "times": { - "compilation": 13466.424, - "data": 63757.121, - "framework": 850635.391, - "kernel_overhead": 336957.182, - "profiling_overhead": 53306.218, - "profiling_runs": 396614.87, + "compilation_time": 15182.617, + "data": 58822.988, + "framework": 859840.7960000001, + "kernel_overhead": 346499.308, + "profiling_overhead": 48404.961, + "profiling_runs": 406113.539, "runtimes": [ - 9523.776 + 9336.512 ], - "search_algorithm": 22.312, - "validation": 20.607 + "search_algorithm": 26.137, + "validation": 17.338 }, - "timestamp": "2026-01-27 09:25:19 UTC" + "timestamp": "2026-03-13 09:40:4 UTC" }, { + "compilation_data": { + "constant_memory_size": 0, + "global_size": { + "x": 8, + "y": 128, + "z": 64 + }, + "local_memory_size": 0, + "local_size": { + "x": 32, + "y": 2, + "z": 1 + }, + "max_work_group_size": 1024, + "private_memory_size": 0, + "registers": 23 + }, "configuration": { - "INNER_UNROLL_FACTOR": "1", - "USE_CONSTANT_MEMORY": "0", - "USE_SOA": "0", - "VECTOR_SIZE": "1", - "WORK_GROUP_SIZE_X": "32", - "WORK_GROUP_SIZE_Y": "4", - "WORK_GROUP_SIZE_Z": "1", - "Z_ITERATIONS": "32" + "INNER_UNROLL_FACTOR": 2, + "USE_CONSTANT_MEMORY": 0, + "USE_SOA": 0, + "VECTOR_SIZE": 1, + "WORK_GROUP_SIZE_X": 32, + "WORK_GROUP_SIZE_Y": 2, + "WORK_GROUP_SIZE_Z": 1, + "Z_ITERATIONS": 4 }, "correctness": 1, "invalidity": "correct", @@ -12216,61 +13134,61 @@ { "name": "time", "unit": "", - "value": 9734.624 + "value": 3461.92 }, { "name": "dram__throughput.avg.pct_of_peak_sustained_elapsed", "type": "Double", "unit": "", - "value": 11.089535543231662 + "value": 14.05185274419329 }, { "name": "dram__sectors_read.sum", "type": "Double", "unit": "", - "value": 2103124.0 + "value": 8648.0 }, { "name": "dram__sectors_write.sum", "type": "Double", "unit": "", - "value": 2176436.0 + "value": 1839020.0 }, { "name": "lts__t_sectors.avg.pct_of_peak_sustained_elapsed", "type": "Double", "unit": "", - "value": 76.94170572596197 + "value": 1.8090853378049891 }, { "name": "lts__t_sectors_op_read.sum", "type": "Double", "unit": "", - "value": 123082181.0 + "value": 67242.0 }, { "name": "lts__t_sectors_op_write.sum", "type": "Double", "unit": "", - "value": 138416228.0 + "value": 2100872.0 }, { "name": "l1tex__t_requests_pipe_lsu_mem_global_op_ld.avg.pct_of_peak_sustained_active", "type": "Double", "unit": "", - "value": 0.40818227241852506 + "value": 6.059403960102685 }, { "name": "l1tex__t_requests_pipe_lsu_mem_global_op_ld.sum", "type": "Double", "unit": "", - "value": 1572864.0 + "value": 8388608.0 }, { "name": "l1tex__data_pipe_lsu_wavefronts_mem_shared.avg.pct_of_peak_sustained_elapsed", "type": "Double", "unit": "", - "value": 0.012532737032885523 + "value": 0.2839316422093916 }, { "name": "l1tex__data_pipe_lsu_wavefronts_mem_shared_op_ld.sum", @@ -12288,25 +13206,25 @@ "name": "l1tex__t_sectors_pipe_lsu_mem_local_op_ld.sum", "type": "Double", "unit": "", - "value": 136314880.0 + "value": 0.0 }, { "name": "l1tex__t_sectors_pipe_lsu_mem_local_op_st.sum", "type": "Double", "unit": "", - "value": 136314880.0 + "value": 0.0 }, { "name": "sm__warps_active.avg.pct_of_peak_sustained_active", "type": "Double", "unit": "", - "value": 95.32791942052192 + "value": 98.30881807718194 }, { "name": "smsp__cycles_active.avg.pct_of_peak_sustained_elapsed", "type": "Double", "unit": "", - "value": 98.1036154947303 + "value": 99.95083766968558 }, { "name": "smsp__sass_thread_inst_executed_op_fp16_pred_on.sum", @@ -12318,7 +13236,7 @@ "name": "smsp__sass_thread_inst_executed_op_fp32_pred_on.sum", "type": "Double", "unit": "", - "value": 4479516672.0 + "value": 5637144576.0 }, { "name": "smsp__sass_thread_inst_executed_op_fp64_pred_on.sum", @@ -12330,43 +13248,43 @@ "name": "smsp__sass_thread_inst_executed_op_integer_pred_on.sum", "type": "Double", "unit": "", - "value": 3393716224.0 + "value": 4945084416.0 }, { "name": "smsp__sass_thread_inst_executed_op_control_pred_on.sum", "type": "Double", "unit": "", - "value": 1073741824.0 + "value": 536870912.0 }, { "name": "smsp__sass_thread_inst_executed_op_memory_pred_on.sum", "type": "Double", "unit": "", - "value": 2235564032.0 + "value": 285212672.0 }, { "name": "smsp__sass_thread_inst_executed_op_misc_pred_on.sum", "type": "Double", "unit": "", - "value": 106954752.0 + "value": 1665138688.0 }, { "name": "smsp__sass_thread_inst_executed_op_conversion_pred_on.sum", "type": "Double", "unit": "", - "value": 1572864.0 + "value": 12582912.0 }, { "name": "smsp__inst_executed.sum", "type": "Double", "unit": "", - "value": 353976320.0 + "value": 484966400.0 }, { "name": "smsp__pipe_fma_cycles_active.avg.pct_of_peak_sustained_active", "type": "Double", "unit": "", - "value": 7.157666155515047 + "value": 41.37960786170656 }, { "name": "smsp__inst_executed_pipe_fp64.avg.pct_of_peak_sustained_active", @@ -12378,13 +13296,13 @@ "name": "smsp__inst_executed_pipe_xu.avg.pct_of_peak_sustained_active", "type": "Double", "unit": "", - "value": 17.442133546871013 + "value": 48.48150157301422 }, { "name": "smsp__inst_executed_pipe_lsu.avg.pct_of_peak_sustained_active", "type": "Double", "unit": "", - "value": 9.085154277892896 + "value": 3.3615103629726653 }, { "name": "smsp__inst_executed_pipe_tex.avg.pct_of_peak_sustained_active", @@ -12396,7 +13314,7 @@ "name": "smsp__issue_active.avg.pct_of_peak_sustained_active", "type": "Double", "unit": "", - "value": 23.000591033490107 + "value": 87.58930879836447 }, { "name": "smsp__thread_inst_executed_per_inst_executed.ratio", @@ -12415,30 +13333,47 @@ "time" ], "times": { - "compilation": 13973.063, - "data": 62627.654, - "framework": 867696.3089999999, - "kernel_overhead": 345764.783, - "profiling_overhead": 52456.835, - "profiling_runs": 406847.037, + "compilation_time": 15450.728, + "data": 57292.503, + "framework": 1351847.705, + "kernel_overhead": 609429.189, + "profiling_overhead": 46959.469, + "profiling_runs": 638166.544, "runtimes": [ - 9734.624 + 3461.92 ], - "search_algorithm": 22.146, - "validation": 16.66 + "search_algorithm": 25.87, + "validation": 17.871 }, - "timestamp": "2026-01-27 09:25:19 UTC" + "timestamp": "2026-03-13 09:40:5 UTC" }, { + "compilation_data": { + "constant_memory_size": 0, + "global_size": { + "x": 8, + "y": 128, + "z": 32 + }, + "local_memory_size": 0, + "local_size": { + "x": 32, + "y": 2, + "z": 1 + }, + "max_work_group_size": 1024, + "private_memory_size": 0, + "registers": 27 + }, "configuration": { - "INNER_UNROLL_FACTOR": "1", - "USE_CONSTANT_MEMORY": "0", - "USE_SOA": "0", - "VECTOR_SIZE": "1", - "WORK_GROUP_SIZE_X": "32", - "WORK_GROUP_SIZE_Y": "8", - "WORK_GROUP_SIZE_Z": "1", - "Z_ITERATIONS": "32" + "INNER_UNROLL_FACTOR": 2, + "USE_CONSTANT_MEMORY": 0, + "USE_SOA": 0, + "VECTOR_SIZE": 1, + "WORK_GROUP_SIZE_X": 32, + "WORK_GROUP_SIZE_Y": 2, + "WORK_GROUP_SIZE_Z": 1, + "Z_ITERATIONS": 8 }, "correctness": 1, "invalidity": "correct", @@ -12446,61 +13381,61 @@ { "name": "time", "unit": "", - "value": 8610.784 + "value": 4115.776 }, { "name": "dram__throughput.avg.pct_of_peak_sustained_elapsed", "type": "Double", "unit": "", - "value": 12.919931376454635 + "value": 11.764146304638787 }, { "name": "dram__sectors_read.sum", "type": "Double", "unit": "", - "value": 2116484.0 + "value": 4924.0 }, { "name": "dram__sectors_write.sum", "type": "Double", "unit": "", - "value": 2200832.0 + "value": 1840176.0 }, { "name": "lts__t_sectors.avg.pct_of_peak_sustained_elapsed", "type": "Double", "unit": "", - "value": 61.23147621858555 + "value": 1.5221692651724867 }, { "name": "lts__t_sectors_op_read.sum", "type": "Double", "unit": "", - "value": 48094637.0 + "value": 73101.0 }, { "name": "lts__t_sectors_op_write.sum", "type": "Double", "unit": "", - "value": 138420000.0 + "value": 2102870.0 }, { "name": "l1tex__t_requests_pipe_lsu_mem_global_op_ld.avg.pct_of_peak_sustained_active", "type": "Double", "unit": "", - "value": 0.45059930821840216 + "value": 2.539780593394104 }, { "name": "l1tex__t_requests_pipe_lsu_mem_global_op_ld.sum", "type": "Double", "unit": "", - "value": 1572864.0 + "value": 4194304.0 }, { "name": "l1tex__data_pipe_lsu_wavefronts_mem_shared.avg.pct_of_peak_sustained_elapsed", "type": "Double", "unit": "", - "value": 0.013965839474813516 + "value": 0.11901919581650626 }, { "name": "l1tex__data_pipe_lsu_wavefronts_mem_shared_op_ld.sum", @@ -12518,25 +13453,25 @@ "name": "l1tex__t_sectors_pipe_lsu_mem_local_op_ld.sum", "type": "Double", "unit": "", - "value": 136314880.0 + "value": 0.0 }, { "name": "l1tex__t_sectors_pipe_lsu_mem_local_op_st.sum", "type": "Double", "unit": "", - "value": 136314880.0 + "value": 0.0 }, { "name": "sm__warps_active.avg.pct_of_peak_sustained_active", "type": "Double", "unit": "", - "value": 75.9609686709972 + "value": 98.97656701612499 }, { "name": "smsp__cycles_active.avg.pct_of_peak_sustained_elapsed", "type": "Double", "unit": "", - "value": 98.50397032234108 + "value": 99.9656032780645 }, { "name": "smsp__sass_thread_inst_executed_op_fp16_pred_on.sum", @@ -12548,7 +13483,7 @@ "name": "smsp__sass_thread_inst_executed_op_fp32_pred_on.sum", "type": "Double", "unit": "", - "value": 4479516672.0 + "value": 4966055936.0 }, { "name": "smsp__sass_thread_inst_executed_op_fp64_pred_on.sum", @@ -12560,43 +13495,43 @@ "name": "smsp__sass_thread_inst_executed_op_integer_pred_on.sum", "type": "Double", "unit": "", - "value": 3393716224.0 + "value": 5463080960.0 }, { "name": "smsp__sass_thread_inst_executed_op_control_pred_on.sum", "type": "Double", "unit": "", - "value": 1073741824.0 + "value": 536870912.0 }, { "name": "smsp__sass_thread_inst_executed_op_memory_pred_on.sum", "type": "Double", "unit": "", - "value": 2235564032.0 + "value": 150994944.0 }, { "name": "smsp__sass_thread_inst_executed_op_misc_pred_on.sum", "type": "Double", "unit": "", - "value": 106954752.0 + "value": 1369440256.0 }, { "name": "smsp__sass_thread_inst_executed_op_conversion_pred_on.sum", "type": "Double", "unit": "", - "value": 1572864.0 + "value": 6291456.0 }, { "name": "smsp__inst_executed.sum", "type": "Double", "unit": "", - "value": 353976320.0 + "value": 596246528.0 }, { "name": "smsp__pipe_fma_cycles_active.avg.pct_of_peak_sustained_active", "type": "Double", "unit": "", - "value": 7.942938322994504 + "value": 40.96704260326837 }, { "name": "smsp__inst_executed_pipe_fp64.avg.pct_of_peak_sustained_active", @@ -12608,13 +13543,13 @@ "name": "smsp__inst_executed_pipe_xu.avg.pct_of_peak_sustained_active", "type": "Double", "unit": "", - "value": 19.357621931936166 + "value": 40.639197390424 }, { "name": "smsp__inst_executed_pipe_lsu.avg.pct_of_peak_sustained_active", "type": "Double", "unit": "", - "value": 10.082882419869582 + "value": 1.488251857559473 }, { "name": "smsp__inst_executed_pipe_tex.avg.pct_of_peak_sustained_active", @@ -12626,7 +13561,7 @@ "name": "smsp__issue_active.avg.pct_of_peak_sustained_active", "type": "Double", "unit": "", - "value": 25.526506933645955 + "value": 90.26797435333168 }, { "name": "smsp__thread_inst_executed_per_inst_executed.ratio", @@ -12645,30 +13580,47 @@ "time" ], "times": { - "compilation": 13179.006, - "data": 64160.457, - "framework": 858224.015, - "kernel_overhead": 342555.254, - "profiling_overhead": 53899.625, - "profiling_runs": 397608.679, + "compilation_time": 14668.051, + "data": 57190.325, + "framework": 1940139.5839999998, + "kernel_overhead": 901936.975, + "profiling_overhead": 47184.013, + "profiling_runs": 933828.271, "runtimes": [ - 8610.784 + 4115.776 ], - "search_algorithm": 32.498, - "validation": 13.262 + "search_algorithm": 30.692, + "validation": 21.156 }, - "timestamp": "2026-01-27 09:25:20 UTC" + "timestamp": "2026-03-13 09:40:6 UTC" }, { + "compilation_data": { + "constant_memory_size": 0, + "global_size": { + "x": 8, + "y": 128, + "z": 16 + }, + "local_memory_size": 0, + "local_size": { + "x": 32, + "y": 2, + "z": 1 + }, + "max_work_group_size": 1024, + "private_memory_size": 64, + "registers": 32 + }, "configuration": { - "INNER_UNROLL_FACTOR": "2", - "USE_CONSTANT_MEMORY": "0", - "USE_SOA": "0", - "VECTOR_SIZE": "1", - "WORK_GROUP_SIZE_X": "16", - "WORK_GROUP_SIZE_Y": "4", - "WORK_GROUP_SIZE_Z": "1", - "Z_ITERATIONS": "4" + "INNER_UNROLL_FACTOR": 2, + "USE_CONSTANT_MEMORY": 0, + "USE_SOA": 0, + "VECTOR_SIZE": 1, + "WORK_GROUP_SIZE_X": 32, + "WORK_GROUP_SIZE_Y": 2, + "WORK_GROUP_SIZE_Z": 1, + "Z_ITERATIONS": 16 }, "correctness": 1, "invalidity": "correct", @@ -12676,61 +13628,61 @@ { "name": "time", "unit": "", - "value": 3487.04 + "value": 8468.192 }, { "name": "dram__throughput.avg.pct_of_peak_sustained_elapsed", "type": "Double", "unit": "", - "value": 29.85301702529693 + "value": 6.182997552635893 }, { "name": "dram__sectors_read.sum", "type": "Double", "unit": "", - "value": 2097328.0 + "value": 24488.0 }, { "name": "dram__sectors_write.sum", "type": "Double", "unit": "", - "value": 1866312.0 + "value": 2003092.0 }, { "name": "lts__t_sectors.avg.pct_of_peak_sustained_elapsed", "type": "Double", "unit": "", - "value": 3.5031290049981583 + "value": 55.791246828230165 }, { "name": "lts__t_sectors_op_read.sum", "type": "Double", "unit": "", - "value": 2140357.0 + "value": 20951393.0 }, { "name": "lts__t_sectors_op_write.sum", "type": "Double", "unit": "", - "value": 2098982.0 + "value": 138423538.0 }, { "name": "l1tex__t_requests_pipe_lsu_mem_global_op_ld.avg.pct_of_peak_sustained_active", "type": "Double", "unit": "", - "value": 6.375503953595346 + "value": 0.6366516220328106 }, { "name": "l1tex__t_requests_pipe_lsu_mem_global_op_ld.sum", "type": "Double", "unit": "", - "value": 8912896.0 + "value": 2097152.0 }, { "name": "l1tex__data_pipe_lsu_wavefronts_mem_shared.avg.pct_of_peak_sustained_elapsed", "type": "Double", "unit": "", - "value": 0.28119387256858003 + "value": 0.029767177215260704 }, { "name": "l1tex__data_pipe_lsu_wavefronts_mem_shared_op_ld.sum", @@ -12748,25 +13700,25 @@ "name": "l1tex__t_sectors_pipe_lsu_mem_local_op_ld.sum", "type": "Double", "unit": "", - "value": 0.0 + "value": 136314880.0 }, { "name": "l1tex__t_sectors_pipe_lsu_mem_local_op_st.sum", "type": "Double", "unit": "", - "value": 0.0 + "value": 136314880.0 }, { "name": "sm__warps_active.avg.pct_of_peak_sustained_active", "type": "Double", "unit": "", - "value": 98.35495715796856 + "value": 90.31836536358495 }, { "name": "smsp__cycles_active.avg.pct_of_peak_sustained_elapsed", "type": "Double", "unit": "", - "value": 99.96311107478337 + "value": 99.86633219006991 }, { "name": "smsp__sass_thread_inst_executed_op_fp16_pred_on.sum", @@ -12778,7 +13730,7 @@ "name": "smsp__sass_thread_inst_executed_op_fp32_pred_on.sum", "type": "Double", "unit": "", - "value": 5653921792.0 + "value": 4630511616.0 }, { "name": "smsp__sass_thread_inst_executed_op_fp64_pred_on.sum", @@ -12790,7 +13742,7 @@ "name": "smsp__sass_thread_inst_executed_op_integer_pred_on.sum", "type": "Double", "unit": "", - "value": 4680843264.0 + "value": 1958739968.0 }, { "name": "smsp__sass_thread_inst_executed_op_control_pred_on.sum", @@ -12802,31 +13754,31 @@ "name": "smsp__sass_thread_inst_executed_op_memory_pred_on.sum", "type": "Double", "unit": "", - "value": 301989888.0 + "value": 1178599424.0 }, { "name": "smsp__sass_thread_inst_executed_op_misc_pred_on.sum", "type": "Double", "unit": "", - "value": 1933574144.0 + "value": 143654912.0 }, { "name": "smsp__sass_thread_inst_executed_op_conversion_pred_on.sum", "type": "Double", "unit": "", - "value": 12582912.0 + "value": 3145728.0 }, { "name": "smsp__inst_executed.sum", "type": "Double", "unit": "", - "value": 486146048.0 + "value": 266371072.0 }, { "name": "smsp__pipe_fma_cycles_active.avg.pct_of_peak_sustained_active", "type": "Double", "unit": "", - "value": 39.570246381220834 + "value": 9.036016762294514 }, { "name": "smsp__inst_executed_pipe_fp64.avg.pct_of_peak_sustained_active", @@ -12838,13 +13790,13 @@ "name": "smsp__inst_executed_pipe_xu.avg.pct_of_peak_sustained_active", "type": "Double", "unit": "", - "value": 48.00813060176659 + "value": 20.348258717404438 }, { "name": "smsp__inst_executed_pipe_lsu.avg.pct_of_peak_sustained_active", "type": "Double", "unit": "", - "value": 3.5162205030590763 + "value": 5.598751849246778 }, { "name": "smsp__inst_executed_pipe_tex.avg.pct_of_peak_sustained_active", @@ -12856,7 +13808,7 @@ "name": "smsp__issue_active.avg.pct_of_peak_sustained_active", "type": "Double", "unit": "", - "value": 86.9450724196685 + "value": 20.192081444624687 }, { "name": "smsp__thread_inst_executed_per_inst_executed.ratio", @@ -12875,30 +13827,47 @@ "time" ], "times": { - "compilation": 13963.879, - "data": 63861.373, - "framework": 1436027.9610000001, - "kernel_overhead": 645053.407, - "profiling_overhead": 53277.428, - "profiling_runs": 673835.753, + "compilation_time": 14839.361, + "data": 58465.931, + "framework": 547386.1529999999, + "kernel_overhead": 193680.853, + "profiling_overhead": 48380.678, + "profiling_runs": 246858.691, "runtimes": [ - 3487.04 + 8468.192 ], - "search_algorithm": 32.068, - "validation": 14.139 + "search_algorithm": 25.264, + "validation": 15.059 }, - "timestamp": "2026-01-27 09:25:20 UTC" + "timestamp": "2026-03-13 09:40:7 UTC" }, { + "compilation_data": { + "constant_memory_size": 0, + "global_size": { + "x": 8, + "y": 128, + "z": 8 + }, + "local_memory_size": 0, + "local_size": { + "x": 32, + "y": 2, + "z": 1 + }, + "max_work_group_size": 1024, + "private_memory_size": 128, + "registers": 32 + }, "configuration": { - "INNER_UNROLL_FACTOR": "2", - "USE_CONSTANT_MEMORY": "0", - "USE_SOA": "0", - "VECTOR_SIZE": "1", - "WORK_GROUP_SIZE_X": "16", - "WORK_GROUP_SIZE_Y": "8", - "WORK_GROUP_SIZE_Z": "1", - "Z_ITERATIONS": "4" + "INNER_UNROLL_FACTOR": 2, + "USE_CONSTANT_MEMORY": 0, + "USE_SOA": 0, + "VECTOR_SIZE": 1, + "WORK_GROUP_SIZE_X": 32, + "WORK_GROUP_SIZE_Y": 2, + "WORK_GROUP_SIZE_Z": 1, + "Z_ITERATIONS": 32 }, "correctness": 1, "invalidity": "correct", @@ -12906,61 +13875,61 @@ { "name": "time", "unit": "", - "value": 3480.96 + "value": 9451.104 }, { "name": "dram__throughput.avg.pct_of_peak_sustained_elapsed", "type": "Double", "unit": "", - "value": 29.664719815659407 + "value": 5.889228285100796 }, { "name": "dram__sectors_read.sum", "type": "Double", "unit": "", - "value": 2107720.0 + "value": 7204.0 }, { "name": "dram__sectors_write.sum", "type": "Double", "unit": "", - "value": 1871852.0 + "value": 2197512.0 }, { "name": "lts__t_sectors.avg.pct_of_peak_sustained_elapsed", "type": "Double", "unit": "", - "value": 3.505854803404582 + "value": 81.77222747069209 }, { "name": "lts__t_sectors_op_read.sum", "type": "Double", "unit": "", - "value": 2153279.0 + "value": 133094342.0 }, { "name": "lts__t_sectors_op_write.sum", "type": "Double", "unit": "", - "value": 2104793.0 + "value": 138416387.0 }, { "name": "l1tex__t_requests_pipe_lsu_mem_global_op_ld.avg.pct_of_peak_sustained_active", "type": "Double", "unit": "", - "value": 6.375396647394928 + "value": 0.2750836473069054 }, { "name": "l1tex__t_requests_pipe_lsu_mem_global_op_ld.sum", "type": "Double", "unit": "", - "value": 8912896.0 + "value": 1048576.0 }, { "name": "l1tex__data_pipe_lsu_wavefronts_mem_shared.avg.pct_of_peak_sustained_elapsed", "type": "Double", "unit": "", - "value": 0.2811768859007743 + "value": 0.012764518361938332 }, { "name": "l1tex__data_pipe_lsu_wavefronts_mem_shared_op_ld.sum", @@ -12978,25 +13947,25 @@ "name": "l1tex__t_sectors_pipe_lsu_mem_local_op_ld.sum", "type": "Double", "unit": "", - "value": 0.0 + "value": 136314880.0 }, { "name": "l1tex__t_sectors_pipe_lsu_mem_local_op_st.sum", "type": "Double", "unit": "", - "value": 0.0 + "value": 136314880.0 }, { "name": "sm__warps_active.avg.pct_of_peak_sustained_active", "type": "Double", "unit": "", - "value": 97.84719217712846 + "value": 98.96076254901725 }, { "name": "smsp__cycles_active.avg.pct_of_peak_sustained_elapsed", "type": "Double", "unit": "", - "value": 99.96128844508415 + "value": 98.93603070140904 }, { "name": "smsp__sass_thread_inst_executed_op_fp16_pred_on.sum", @@ -13008,7 +13977,7 @@ "name": "smsp__sass_thread_inst_executed_op_fp32_pred_on.sum", "type": "Double", "unit": "", - "value": 5653921792.0 + "value": 4462739456.0 }, { "name": "smsp__sass_thread_inst_executed_op_fp64_pred_on.sum", @@ -13020,7 +13989,7 @@ "name": "smsp__sass_thread_inst_executed_op_integer_pred_on.sum", "type": "Double", "unit": "", - "value": 4680843264.0 + "value": 1817706496.0 }, { "name": "smsp__sass_thread_inst_executed_op_control_pred_on.sum", @@ -13032,31 +14001,31 @@ "name": "smsp__sass_thread_inst_executed_op_memory_pred_on.sum", "type": "Double", "unit": "", - "value": 301989888.0 + "value": 1145044992.0 }, { "name": "smsp__sass_thread_inst_executed_op_misc_pred_on.sum", "type": "Double", "unit": "", - "value": 1933574144.0 + "value": 72351744.0 }, { "name": "smsp__sass_thread_inst_executed_op_conversion_pred_on.sum", "type": "Double", "unit": "", - "value": 12582912.0 + "value": 1572864.0 }, { "name": "smsp__inst_executed.sum", "type": "Double", "unit": "", - "value": 486146048.0 + "value": 252264448.0 }, { "name": "smsp__pipe_fma_cycles_active.avg.pct_of_peak_sustained_active", "type": "Double", "unit": "", - "value": 39.5680981508636 + "value": 7.265474455786407 }, { "name": "smsp__inst_executed_pipe_fp64.avg.pct_of_peak_sustained_active", @@ -13068,13 +14037,13 @@ "name": "smsp__inst_executed_pipe_xu.avg.pct_of_peak_sustained_active", "type": "Double", "unit": "", - "value": 48.00610577039708 + "value": 17.615243182837332 }, { "name": "smsp__inst_executed_pipe_lsu.avg.pct_of_peak_sustained_active", "type": "Double", "unit": "", - "value": 3.5160721999802544 + "value": 4.702702251082183 }, { "name": "smsp__inst_executed_pipe_tex.avg.pct_of_peak_sustained_active", @@ -13086,7 +14055,7 @@ "name": "smsp__issue_active.avg.pct_of_peak_sustained_active", "type": "Double", "unit": "", - "value": 86.94143664745035 + "value": 16.554364724951643 }, { "name": "smsp__thread_inst_executed_per_inst_executed.ratio", @@ -13105,30 +14074,47 @@ "time" ], "times": { - "compilation": 13296.517, - "data": 63161.365, - "framework": 1429795.178, - "kernel_overhead": 642451.111, - "profiling_overhead": 52748.286, - "profiling_runs": 671434.416, + "compilation_time": 14409.408, + "data": 58383.301, + "framework": 533165.28, + "kernel_overhead": 183449.556, + "profiling_overhead": 48296.547, + "profiling_runs": 243035.876, "runtimes": [ - 3480.96 + 9451.104 ], - "search_algorithm": 24.954, - "validation": 16.672 + "search_algorithm": 22.1, + "validation": 14.606 }, - "timestamp": "2026-01-27 09:25:21 UTC" + "timestamp": "2026-03-13 09:40:7 UTC" }, { + "compilation_data": { + "constant_memory_size": 0, + "global_size": { + "x": 8, + "y": 128, + "z": 32 + }, + "local_memory_size": 0, + "local_size": { + "x": 32, + "y": 2, + "z": 1 + }, + "max_work_group_size": 1024, + "private_memory_size": 0, + "registers": 31 + }, "configuration": { - "INNER_UNROLL_FACTOR": "2", - "USE_CONSTANT_MEMORY": "0", - "USE_SOA": "0", - "VECTOR_SIZE": "1", - "WORK_GROUP_SIZE_X": "32", - "WORK_GROUP_SIZE_Y": "2", - "WORK_GROUP_SIZE_Z": "1", - "Z_ITERATIONS": "4" + "INNER_UNROLL_FACTOR": 4, + "USE_CONSTANT_MEMORY": 0, + "USE_SOA": 0, + "VECTOR_SIZE": 1, + "WORK_GROUP_SIZE_X": 32, + "WORK_GROUP_SIZE_Y": 2, + "WORK_GROUP_SIZE_Z": 1, + "Z_ITERATIONS": 8 }, "correctness": 1, "invalidity": "correct", @@ -13136,61 +14122,61 @@ { "name": "time", "unit": "", - "value": 3488.672 + "value": 2576.864 }, { "name": "dram__throughput.avg.pct_of_peak_sustained_elapsed", "type": "Double", "unit": "", - "value": 29.634384279433885 + "value": 18.790507597316136 }, { "name": "dram__sectors_read.sum", "type": "Double", "unit": "", - "value": 2097348.0 + "value": 2892.0 }, { "name": "dram__sectors_write.sum", "type": "Double", "unit": "", - "value": 1837112.0 + "value": 1839624.0 }, { "name": "lts__t_sectors.avg.pct_of_peak_sustained_elapsed", "type": "Double", "unit": "", - "value": 3.515352898225868 + "value": 2.410230107998907 }, { "name": "lts__t_sectors_op_read.sum", "type": "Double", "unit": "", - "value": 2155197.0 + "value": 48351.0 }, { "name": "lts__t_sectors_op_write.sum", "type": "Double", "unit": "", - "value": 2098996.0 + "value": 2103421.0 }, { "name": "l1tex__t_requests_pipe_lsu_mem_global_op_ld.avg.pct_of_peak_sustained_active", "type": "Double", "unit": "", - "value": 6.376538574298227 + "value": 4.066894538465361 }, { "name": "l1tex__t_requests_pipe_lsu_mem_global_op_ld.sum", "type": "Double", "unit": "", - "value": 8912896.0 + "value": 4194304.0 }, { "name": "l1tex__data_pipe_lsu_wavefronts_mem_shared.avg.pct_of_peak_sustained_elapsed", "type": "Double", "unit": "", - "value": 0.28120114400442625 + "value": 0.1905697738472762 }, { "name": "l1tex__data_pipe_lsu_wavefronts_mem_shared_op_ld.sum", @@ -13220,13 +14206,13 @@ "name": "sm__warps_active.avg.pct_of_peak_sustained_active", "type": "Double", "unit": "", - "value": 98.43270982332766 + "value": 98.72867425644412 }, { "name": "smsp__cycles_active.avg.pct_of_peak_sustained_elapsed", "type": "Double", "unit": "", - "value": 99.95019144928148 + "value": 99.95653446082463 }, { "name": "smsp__sass_thread_inst_executed_op_fp16_pred_on.sum", @@ -13238,7 +14224,7 @@ "name": "smsp__sass_thread_inst_executed_op_fp32_pred_on.sum", "type": "Double", "unit": "", - "value": 5653921792.0 + "value": 4966055936.0 }, { "name": "smsp__sass_thread_inst_executed_op_fp64_pred_on.sum", @@ -13250,43 +14236,43 @@ "name": "smsp__sass_thread_inst_executed_op_integer_pred_on.sum", "type": "Double", "unit": "", - "value": 4680843264.0 + "value": 2776629248.0 }, { "name": "smsp__sass_thread_inst_executed_op_control_pred_on.sum", "type": "Double", "unit": "", - "value": 536870912.0 + "value": 268435456.0 }, { "name": "smsp__sass_thread_inst_executed_op_memory_pred_on.sum", "type": "Double", "unit": "", - "value": 301989888.0 + "value": 150994944.0 }, { "name": "smsp__sass_thread_inst_executed_op_misc_pred_on.sum", "type": "Double", "unit": "", - "value": 1933574144.0 + "value": 1642070016.0 }, { "name": "smsp__sass_thread_inst_executed_op_conversion_pred_on.sum", "type": "Double", "unit": "", - "value": 12582912.0 + "value": 6291456.0 }, { "name": "smsp__inst_executed.sum", "type": "Double", "unit": "", - "value": 486146048.0 + "value": 378208256.0 }, { "name": "smsp__pipe_fma_cycles_active.avg.pct_of_peak_sustained_active", "type": "Double", "unit": "", - "value": 39.5765944587132 + "value": 45.248249348924325 }, { "name": "smsp__inst_executed_pipe_fp64.avg.pct_of_peak_sustained_active", @@ -13298,13 +14284,13 @@ "name": "smsp__inst_executed_pipe_xu.avg.pct_of_peak_sustained_active", "type": "Double", "unit": "", - "value": 48.01557777349685 + "value": 65.07610181839995 }, { "name": "smsp__inst_executed_pipe_lsu.avg.pct_of_peak_sustained_active", "type": "Double", "unit": "", - "value": 3.516765950207289 + "value": 2.38315802557617 }, { "name": "smsp__inst_executed_pipe_tex.avg.pct_of_peak_sustained_active", @@ -13316,7 +14302,7 @@ "name": "smsp__issue_active.avg.pct_of_peak_sustained_active", "type": "Double", "unit": "", - "value": 86.95855800203792 + "value": 91.6888525085137 }, { "name": "smsp__thread_inst_executed_per_inst_executed.ratio", @@ -13335,30 +14321,47 @@ "time" ], "times": { - "compilation": 13664.801, - "data": 64601.184, - "framework": 1434993.831, - "kernel_overhead": 643658.882, - "profiling_overhead": 54509.053, - "profiling_runs": 672224.712, + "compilation_time": 14945.79, + "data": 58191.565, + "framework": 1075123.785, + "kernel_overhead": 472188.01, + "profiling_overhead": 48207.024, + "profiling_runs": 496537.186, "runtimes": [ - 3488.672 + 2576.864 ], - "search_algorithm": 26.141, - "validation": 14.403 + "search_algorithm": 26.198, + "validation": 15.752 }, - "timestamp": "2026-01-27 09:25:22 UTC" + "timestamp": "2026-03-13 09:40:7 UTC" }, { + "compilation_data": { + "constant_memory_size": 0, + "global_size": { + "x": 8, + "y": 128, + "z": 16 + }, + "local_memory_size": 0, + "local_size": { + "x": 32, + "y": 2, + "z": 1 + }, + "max_work_group_size": 1024, + "private_memory_size": 0, + "registers": 38 + }, "configuration": { - "INNER_UNROLL_FACTOR": "2", - "USE_CONSTANT_MEMORY": "0", - "USE_SOA": "0", - "VECTOR_SIZE": "1", - "WORK_GROUP_SIZE_X": "32", - "WORK_GROUP_SIZE_Y": "4", - "WORK_GROUP_SIZE_Z": "1", - "Z_ITERATIONS": "4" + "INNER_UNROLL_FACTOR": 4, + "USE_CONSTANT_MEMORY": 0, + "USE_SOA": 0, + "VECTOR_SIZE": 1, + "WORK_GROUP_SIZE_X": 32, + "WORK_GROUP_SIZE_Y": 2, + "WORK_GROUP_SIZE_Z": 1, + "Z_ITERATIONS": 16 }, "correctness": 1, "invalidity": "correct", @@ -13366,61 +14369,61 @@ { "name": "time", "unit": "", - "value": 3534.72 + "value": 3344.128 }, { "name": "dram__throughput.avg.pct_of_peak_sustained_elapsed", "type": "Double", "unit": "", - "value": 29.412015199161424 + "value": 14.651892212338943 }, { "name": "dram__sectors_read.sum", "type": "Double", "unit": "", - "value": 2108924.0 + "value": 2928.0 }, { "name": "dram__sectors_write.sum", "type": "Double", "unit": "", - "value": 1841792.0 + "value": 1839844.0 }, { "name": "lts__t_sectors.avg.pct_of_peak_sustained_elapsed", "type": "Double", "unit": "", - "value": 3.513222429859306 + "value": 1.9126783579369546 }, { "name": "lts__t_sectors_op_read.sum", "type": "Double", "unit": "", - "value": 2168478.0 + "value": 57133.0 }, { "name": "lts__t_sectors_op_write.sum", "type": "Double", "unit": "", - "value": 2102004.0 + "value": 2100434.0 }, { "name": "l1tex__t_requests_pipe_lsu_mem_global_op_ld.avg.pct_of_peak_sustained_active", "type": "Double", "unit": "", - "value": 6.376710017234346 + "value": 1.5934887398812632 }, { "name": "l1tex__t_requests_pipe_lsu_mem_global_op_ld.sum", "type": "Double", "unit": "", - "value": 8912896.0 + "value": 2097152.0 }, { "name": "l1tex__data_pipe_lsu_wavefronts_mem_shared.avg.pct_of_peak_sustained_elapsed", "type": "Double", "unit": "", - "value": 0.2812317460702538 + "value": 0.0746592940613781 }, { "name": "l1tex__data_pipe_lsu_wavefronts_mem_shared_op_ld.sum", @@ -13450,13 +14453,13 @@ "name": "sm__warps_active.avg.pct_of_peak_sustained_active", "type": "Double", "unit": "", - "value": 98.11717151129264 + "value": 98.68769386326086 }, { "name": "smsp__cycles_active.avg.pct_of_peak_sustained_elapsed", "type": "Double", "unit": "", - "value": 99.962152374658 + "value": 99.94808921907156 }, { "name": "smsp__sass_thread_inst_executed_op_fp16_pred_on.sum", @@ -13468,7 +14471,7 @@ "name": "smsp__sass_thread_inst_executed_op_fp32_pred_on.sum", "type": "Double", "unit": "", - "value": 5653921792.0 + "value": 4630511616.0 }, { "name": "smsp__sass_thread_inst_executed_op_fp64_pred_on.sum", @@ -13480,43 +14483,43 @@ "name": "smsp__sass_thread_inst_executed_op_integer_pred_on.sum", "type": "Double", "unit": "", - "value": 4680843264.0 + "value": 3234856960.0 }, { "name": "smsp__sass_thread_inst_executed_op_control_pred_on.sum", "type": "Double", "unit": "", - "value": 536870912.0 + "value": 268435456.0 }, { "name": "smsp__sass_thread_inst_executed_op_memory_pred_on.sum", "type": "Double", "unit": "", - "value": 301989888.0 + "value": 83886080.0 }, { "name": "smsp__sass_thread_inst_executed_op_misc_pred_on.sum", "type": "Double", "unit": "", - "value": 1933574144.0 + "value": 1294991360.0 }, { "name": "smsp__sass_thread_inst_executed_op_conversion_pred_on.sum", "type": "Double", "unit": "", - "value": 12582912.0 + "value": 3145728.0 }, { "name": "smsp__inst_executed.sum", "type": "Double", "unit": "", - "value": 486146048.0 + "value": 500957184.0 }, { "name": "smsp__pipe_fma_cycles_active.avg.pct_of_peak_sustained_active", "type": "Double", "unit": "", - "value": 39.57557262122285 + "value": 45.37259971070455 }, { "name": "smsp__inst_executed_pipe_fp64.avg.pct_of_peak_sustained_active", @@ -13528,13 +14531,13 @@ "name": "smsp__inst_executed_pipe_xu.avg.pct_of_peak_sustained_active", "type": "Double", "unit": "", - "value": 48.01505722162164 + "value": 50.993882735321094 }, { "name": "smsp__inst_executed_pipe_lsu.avg.pct_of_peak_sustained_active", "type": "Double", "unit": "", - "value": 3.5167278238492408 + "value": 1.033323307380774 }, { "name": "smsp__inst_executed_pipe_tex.avg.pct_of_peak_sustained_active", @@ -13546,7 +14549,7 @@ "name": "smsp__issue_active.avg.pct_of_peak_sustained_active", "type": "Double", "unit": "", - "value": 86.95764816923604 + "value": 95.16607687860301 }, { "name": "smsp__thread_inst_executed_per_inst_executed.ratio", @@ -13565,30 +14568,47 @@ "time" ], "times": { - "compilation": 14057.66, - "data": 62223.914, - "framework": 1431120.855, - "kernel_overhead": 643833.453, - "profiling_overhead": 52088.202, - "profiling_runs": 672975.286, + "compilation_time": 16413.005, + "data": 57255.037, + "framework": 1832023.54, + "kernel_overhead": 849708.645, + "profiling_overhead": 47064.459, + "profiling_runs": 877995.399, "runtimes": [ - 3534.72 + 3344.128 ], - "search_algorithm": 25.59, - "validation": 14.5 + "search_algorithm": 29.642, + "validation": 17.266 }, - "timestamp": "2026-01-27 09:25:23 UTC" + "timestamp": "2026-03-13 09:40:8 UTC" }, { + "compilation_data": { + "constant_memory_size": 0, + "global_size": { + "x": 8, + "y": 128, + "z": 8 + }, + "local_memory_size": 0, + "local_size": { + "x": 32, + "y": 2, + "z": 1 + }, + "max_work_group_size": 1024, + "private_memory_size": 0, + "registers": 56 + }, "configuration": { - "INNER_UNROLL_FACTOR": "2", - "USE_CONSTANT_MEMORY": "0", - "USE_SOA": "0", - "VECTOR_SIZE": "1", - "WORK_GROUP_SIZE_X": "32", - "WORK_GROUP_SIZE_Y": "8", - "WORK_GROUP_SIZE_Z": "1", - "Z_ITERATIONS": "4" + "INNER_UNROLL_FACTOR": 4, + "USE_CONSTANT_MEMORY": 0, + "USE_SOA": 0, + "VECTOR_SIZE": 1, + "WORK_GROUP_SIZE_X": 32, + "WORK_GROUP_SIZE_Y": 2, + "WORK_GROUP_SIZE_Z": 1, + "Z_ITERATIONS": 32 }, "correctness": 1, "invalidity": "correct", @@ -13596,61 +14616,61 @@ { "name": "time", "unit": "", - "value": 3476.96 + "value": 5483.008 }, { "name": "dram__throughput.avg.pct_of_peak_sustained_elapsed", "type": "Double", "unit": "", - "value": 29.699699717836232 + "value": 8.787379741405724 }, { "name": "dram__sectors_read.sum", "type": "Double", "unit": "", - "value": 2097416.0 + "value": 492.0 }, { "name": "dram__sectors_write.sum", "type": "Double", "unit": "", - "value": 1836668.0 + "value": 1835044.0 }, { "name": "lts__t_sectors.avg.pct_of_peak_sustained_elapsed", "type": "Double", "unit": "", - "value": 3.5223575121466086 + "value": 1.1499018329281727 }, { "name": "lts__t_sectors_op_read.sum", "type": "Double", "unit": "", - "value": 2153722.0 + "value": 89759.0 }, { "name": "lts__t_sectors_op_write.sum", "type": "Double", "unit": "", - "value": 2098946.0 + "value": 2099490.0 }, { "name": "l1tex__t_requests_pipe_lsu_mem_global_op_ld.avg.pct_of_peak_sustained_active", "type": "Double", "unit": "", - "value": 6.390624462355467 + "value": 0.4767274092027408 }, { "name": "l1tex__t_requests_pipe_lsu_mem_global_op_ld.sum", "type": "Double", "unit": "", - "value": 8912896.0 + "value": 1048576.0 }, { "name": "l1tex__data_pipe_lsu_wavefronts_mem_shared.avg.pct_of_peak_sustained_elapsed", "type": "Double", "unit": "", - "value": 0.2818500538446453 + "value": 0.022341524086119304 }, { "name": "l1tex__data_pipe_lsu_wavefronts_mem_shared_op_ld.sum", @@ -13680,13 +14700,13 @@ "name": "sm__warps_active.avg.pct_of_peak_sustained_active", "type": "Double", "unit": "", - "value": 97.68227816281507 + "value": 73.89633961111531 }, { "name": "smsp__cycles_active.avg.pct_of_peak_sustained_elapsed", "type": "Double", "unit": "", - "value": 99.95747853525361 + "value": 99.97266320076078 }, { "name": "smsp__sass_thread_inst_executed_op_fp16_pred_on.sum", @@ -13698,7 +14718,7 @@ "name": "smsp__sass_thread_inst_executed_op_fp32_pred_on.sum", "type": "Double", "unit": "", - "value": 5653921792.0 + "value": 4462739456.0 }, { "name": "smsp__sass_thread_inst_executed_op_fp64_pred_on.sum", @@ -13710,43 +14730,43 @@ "name": "smsp__sass_thread_inst_executed_op_integer_pred_on.sum", "type": "Double", "unit": "", - "value": 4680843264.0 + "value": 4706533376.0 }, { "name": "smsp__sass_thread_inst_executed_op_control_pred_on.sum", "type": "Double", "unit": "", - "value": 536870912.0 + "value": 268435456.0 }, { "name": "smsp__sass_thread_inst_executed_op_memory_pred_on.sum", "type": "Double", "unit": "", - "value": 301989888.0 + "value": 50331648.0 }, { "name": "smsp__sass_thread_inst_executed_op_misc_pred_on.sum", "type": "Double", "unit": "", - "value": 1933574144.0 + "value": 1121452032.0 }, { "name": "smsp__sass_thread_inst_executed_op_conversion_pred_on.sum", "type": "Double", "unit": "", - "value": 12582912.0 + "value": 1572864.0 }, { "name": "smsp__inst_executed.sum", "type": "Double", "unit": "", - "value": 486146048.0 + "value": 852869120.0 }, { "name": "smsp__pipe_fma_cycles_active.avg.pct_of_peak_sustained_active", "type": "Double", "unit": "", - "value": 39.66322784706977 + "value": 42.99122904613712 }, { "name": "smsp__inst_executed_pipe_fp64.avg.pct_of_peak_sustained_active", @@ -13758,13 +14778,13 @@ "name": "smsp__inst_executed_pipe_xu.avg.pct_of_peak_sustained_active", "type": "Double", "unit": "", - "value": 48.12287173942776 + "value": 30.51196854783408 }, { "name": "smsp__inst_executed_pipe_lsu.avg.pct_of_peak_sustained_active", "type": "Double", "unit": "", - "value": 3.5246243949776193 + "value": 0.36873594802680343 }, { "name": "smsp__inst_executed_pipe_tex.avg.pct_of_peak_sustained_active", @@ -13776,7 +14796,7 @@ "name": "smsp__issue_active.avg.pct_of_peak_sustained_active", "type": "Double", "unit": "", - "value": 87.15290667970875 + "value": 96.94264797541064 }, { "name": "smsp__thread_inst_executed_per_inst_executed.ratio", @@ -13795,30 +14815,47 @@ "time" ], "times": { - "compilation": 13119.314, - "data": 60890.935, - "framework": 1429569.8199999998, - "kernel_overhead": 644494.467, - "profiling_overhead": 50711.833, - "profiling_runs": 673472.585, + "compilation_time": 15016.076, + "data": 57641.365, + "framework": 3056821.1859999998, + "kernel_overhead": 1456557.317, + "profiling_overhead": 47627.972, + "profiling_runs": 1494994.532, "runtimes": [ - 3476.96 + 5483.008 ], - "search_algorithm": 29.613, - "validation": 14.748 + "search_algorithm": 26.033, + "validation": 14.837 }, - "timestamp": "2026-01-27 09:25:23 UTC" + "timestamp": "2026-03-13 09:40:10 UTC" }, { + "compilation_data": { + "constant_memory_size": 0, + "global_size": { + "x": 8, + "y": 128, + "z": 16 + }, + "local_memory_size": 0, + "local_size": { + "x": 32, + "y": 2, + "z": 1 + }, + "max_work_group_size": 1024, + "private_memory_size": 0, + "registers": 38 + }, "configuration": { - "INNER_UNROLL_FACTOR": "2", - "USE_CONSTANT_MEMORY": "0", - "USE_SOA": "0", - "VECTOR_SIZE": "1", - "WORK_GROUP_SIZE_X": "16", - "WORK_GROUP_SIZE_Y": "4", - "WORK_GROUP_SIZE_Z": "1", - "Z_ITERATIONS": "8" + "INNER_UNROLL_FACTOR": 8, + "USE_CONSTANT_MEMORY": 0, + "USE_SOA": 0, + "VECTOR_SIZE": 1, + "WORK_GROUP_SIZE_X": 32, + "WORK_GROUP_SIZE_Y": 2, + "WORK_GROUP_SIZE_Z": 1, + "Z_ITERATIONS": 16 }, "correctness": 1, "invalidity": "correct", @@ -13826,61 +14863,61 @@ { "name": "time", "unit": "", - "value": 4824.8 + "value": 3279.456 }, { "name": "dram__throughput.avg.pct_of_peak_sustained_elapsed", "type": "Double", "unit": "", - "value": 22.668488010851494 + "value": 14.76675380715869 }, { "name": "dram__sectors_read.sum", "type": "Double", "unit": "", - "value": 2107912.0 + "value": 4212.0 }, { "name": "dram__sectors_write.sum", "type": "Double", "unit": "", - "value": 1870832.0 + "value": 1837696.0 }, { "name": "lts__t_sectors.avg.pct_of_peak_sustained_elapsed", "type": "Double", "unit": "", - "value": 2.782611938977219 + "value": 1.8986124633659907 }, { "name": "lts__t_sectors_op_read.sum", "type": "Double", "unit": "", - "value": 2169613.0 + "value": 58717.0 }, { "name": "lts__t_sectors_op_write.sum", "type": "Double", "unit": "", - "value": 2104722.0 + "value": 2099784.0 }, { "name": "l1tex__t_requests_pipe_lsu_mem_global_op_ld.avg.pct_of_peak_sustained_active", "type": "Double", "unit": "", - "value": 2.6593910167979495 + "value": 1.5875429860265615 }, { "name": "l1tex__t_requests_pipe_lsu_mem_global_op_ld.sum", "type": "Double", "unit": "", - "value": 4718592.0 + "value": 2097152.0 }, { "name": "l1tex__data_pipe_lsu_wavefronts_mem_shared.avg.pct_of_peak_sustained_elapsed", "type": "Double", "unit": "", - "value": 0.11076771137439176 + "value": 0.07439297349647399 }, { "name": "l1tex__data_pipe_lsu_wavefronts_mem_shared_op_ld.sum", @@ -13910,13 +14947,13 @@ "name": "sm__warps_active.avg.pct_of_peak_sustained_active", "type": "Double", "unit": "", - "value": 98.90944692341333 + "value": 98.73542220225696 }, { "name": "smsp__cycles_active.avg.pct_of_peak_sustained_elapsed", "type": "Double", "unit": "", - "value": 99.95531865300585 + "value": 99.96446536800639 }, { "name": "smsp__sass_thread_inst_executed_op_fp16_pred_on.sum", @@ -13928,7 +14965,7 @@ "name": "smsp__sass_thread_inst_executed_op_fp32_pred_on.sum", "type": "Double", "unit": "", - "value": 4982833152.0 + "value": 4630511616.0 }, { "name": "smsp__sass_thread_inst_executed_op_fp64_pred_on.sum", @@ -13940,43 +14977,43 @@ "name": "smsp__sass_thread_inst_executed_op_integer_pred_on.sum", "type": "Double", "unit": "", - "value": 5192548352.0 + "value": 2295332864.0 }, { "name": "smsp__sass_thread_inst_executed_op_control_pred_on.sum", "type": "Double", "unit": "", - "value": 536870912.0 + "value": 134217728.0 }, { "name": "smsp__sass_thread_inst_executed_op_memory_pred_on.sum", "type": "Double", "unit": "", - "value": 167772160.0 + "value": 83886080.0 }, { "name": "smsp__sass_thread_inst_executed_op_misc_pred_on.sum", "type": "Double", "unit": "", - "value": 1642070016.0 + "value": 1429209088.0 }, { "name": "smsp__sass_thread_inst_executed_op_conversion_pred_on.sum", "type": "Double", "unit": "", - "value": 6291456.0 + "value": 3145728.0 }, { "name": "smsp__inst_executed.sum", "type": "Double", "unit": "", - "value": 597360640.0 + "value": 471597056.0 }, { "name": "smsp__pipe_fma_cycles_active.avg.pct_of_peak_sustained_active", "type": "Double", "unit": "", - "value": 33.4667253154901 + "value": 42.0283455890491 }, { "name": "smsp__inst_executed_pipe_fp64.avg.pct_of_peak_sustained_active", @@ -13988,13 +15025,13 @@ "name": "smsp__inst_executed_pipe_xu.avg.pct_of_peak_sustained_active", "type": "Double", "unit": "", - "value": 37.825613142586626 + "value": 50.80365613250557 }, { "name": "smsp__inst_executed_pipe_lsu.avg.pct_of_peak_sustained_active", "type": "Double", "unit": "", - "value": 1.5329716263841258 + "value": 1.0294686179194243 }, { "name": "smsp__inst_executed_pipe_tex.avg.pct_of_peak_sustained_active", @@ -14006,7 +15043,7 @@ "name": "smsp__issue_active.avg.pct_of_peak_sustained_active", "type": "Double", "unit": "", - "value": 84.175438163473 + "value": 89.25441863024241 }, { "name": "smsp__thread_inst_executed_per_inst_executed.ratio", @@ -14025,30 +15062,47 @@ "time" ], "times": { - "compilation": 13252.476, - "data": 61599.321, - "framework": 2069413.264, - "kernel_overhead": 961267.714, - "profiling_overhead": 51143.373, - "profiling_runs": 995402.856, + "compilation_time": 14823.022, + "data": 57154.503, + "framework": 1655243.3960000002, + "kernel_overhead": 761634.685, + "profiling_overhead": 47019.942, + "profiling_runs": 789434.266, "runtimes": [ - 4824.8 + 3279.456 ], - "search_algorithm": 38.046, - "validation": 17.62 + "search_algorithm": 25.079, + "validation": 16.507 }, - "timestamp": "2026-01-27 09:25:25 UTC" + "timestamp": "2026-03-13 09:40:11 UTC" }, { + "compilation_data": { + "constant_memory_size": 0, + "global_size": { + "x": 8, + "y": 128, + "z": 8 + }, + "local_memory_size": 0, + "local_size": { + "x": 32, + "y": 2, + "z": 1 + }, + "max_work_group_size": 1024, + "private_memory_size": 0, + "registers": 48 + }, "configuration": { - "INNER_UNROLL_FACTOR": "2", - "USE_CONSTANT_MEMORY": "0", - "USE_SOA": "0", - "VECTOR_SIZE": "1", - "WORK_GROUP_SIZE_X": "16", - "WORK_GROUP_SIZE_Y": "8", - "WORK_GROUP_SIZE_Z": "1", - "Z_ITERATIONS": "8" + "INNER_UNROLL_FACTOR": 8, + "USE_CONSTANT_MEMORY": 0, + "USE_SOA": 0, + "VECTOR_SIZE": 1, + "WORK_GROUP_SIZE_X": 32, + "WORK_GROUP_SIZE_Y": 2, + "WORK_GROUP_SIZE_Z": 1, + "Z_ITERATIONS": 32 }, "correctness": 1, "invalidity": "correct", @@ -14056,61 +15110,61 @@ { "name": "time", "unit": "", - "value": 4424.672 + "value": 6277.28 }, { "name": "dram__throughput.avg.pct_of_peak_sustained_elapsed", "type": "Double", "unit": "", - "value": 23.533760122149467 + "value": 7.767065246516614 }, { "name": "dram__sectors_read.sum", "type": "Double", "unit": "", - "value": 2097552.0 + "value": 13844.0 }, { "name": "dram__sectors_write.sum", "type": "Double", "unit": "", - "value": 1867928.0 + "value": 1841304.0 }, { "name": "lts__t_sectors.avg.pct_of_peak_sustained_elapsed", "type": "Double", "unit": "", - "value": 2.771643171654734 + "value": 1.028680540143125 }, { "name": "lts__t_sectors_op_read.sum", "type": "Double", "unit": "", - "value": 2157641.0 + "value": 117244.0 }, { "name": "lts__t_sectors_op_write.sum", "type": "Double", "unit": "", - "value": 2099283.0 + "value": 2111279.0 }, { "name": "l1tex__t_requests_pipe_lsu_mem_global_op_ld.avg.pct_of_peak_sustained_active", "type": "Double", "unit": "", - "value": 2.659349185150098 + "value": 0.41932640369727187 }, { "name": "l1tex__t_requests_pipe_lsu_mem_global_op_ld.sum", "type": "Double", "unit": "", - "value": 4718592.0 + "value": 1048576.0 }, { "name": "l1tex__data_pipe_lsu_wavefronts_mem_shared.avg.pct_of_peak_sustained_elapsed", "type": "Double", "unit": "", - "value": 0.11078440370529113 + "value": 0.019651985848523106 }, { "name": "l1tex__data_pipe_lsu_wavefronts_mem_shared_op_ld.sum", @@ -14140,13 +15194,13 @@ "name": "sm__warps_active.avg.pct_of_peak_sustained_active", "type": "Double", "unit": "", - "value": 98.82184437042798 + "value": 82.01021625195445 }, { "name": "smsp__cycles_active.avg.pct_of_peak_sustained_elapsed", "type": "Double", "unit": "", - "value": 99.97590086481797 + "value": 99.97501890432119 }, { "name": "smsp__sass_thread_inst_executed_op_fp16_pred_on.sum", @@ -14158,7 +15212,7 @@ "name": "smsp__sass_thread_inst_executed_op_fp32_pred_on.sum", "type": "Double", "unit": "", - "value": 4982833152.0 + "value": 4462739456.0 }, { "name": "smsp__sass_thread_inst_executed_op_fp64_pred_on.sum", @@ -14170,43 +15224,43 @@ "name": "smsp__sass_thread_inst_executed_op_integer_pred_on.sum", "type": "Double", "unit": "", - "value": 5192548352.0 + "value": 6215958528.0 }, { "name": "smsp__sass_thread_inst_executed_op_control_pred_on.sum", "type": "Double", "unit": "", - "value": 536870912.0 + "value": 134217728.0 }, { "name": "smsp__sass_thread_inst_executed_op_memory_pred_on.sum", "type": "Double", "unit": "", - "value": 167772160.0 + "value": 50331648.0 }, { "name": "smsp__sass_thread_inst_executed_op_misc_pred_on.sum", "type": "Double", "unit": "", - "value": 1642070016.0 + "value": 1355284480.0 }, { "name": "smsp__sass_thread_inst_executed_op_conversion_pred_on.sum", "type": "Double", "unit": "", - "value": 6291456.0 + "value": 1572864.0 }, { "name": "smsp__inst_executed.sum", "type": "Double", "unit": "", - "value": 597360640.0 + "value": 936706048.0 }, { "name": "smsp__pipe_fma_cycles_active.avg.pct_of_peak_sustained_active", "type": "Double", "unit": "", - "value": 33.46500766342329 + "value": 42.8453442859236 }, { "name": "smsp__inst_executed_pipe_fp64.avg.pct_of_peak_sustained_active", @@ -14218,13 +15272,13 @@ "name": "smsp__inst_executed_pipe_xu.avg.pct_of_peak_sustained_active", "type": "Double", "unit": "", - "value": 37.82352494047872 + "value": 26.83821582555742 }, { "name": "smsp__inst_executed_pipe_lsu.avg.pct_of_peak_sustained_active", "type": "Double", "unit": "", - "value": 1.5328869970994796 + "value": 0.3243387898840557 }, { "name": "smsp__inst_executed_pipe_tex.avg.pct_of_peak_sustained_active", @@ -14236,7 +15290,7 @@ "name": "smsp__issue_active.avg.pct_of_peak_sustained_active", "type": "Double", "unit": "", - "value": 84.17081597035677 + "value": 93.65248504664707 }, { "name": "smsp__thread_inst_executed_per_inst_executed.ratio", @@ -14255,30 +15309,47 @@ "time" ], "times": { - "compilation": 13685.975, - "data": 63637.703, - "framework": 2072399.37, - "kernel_overhead": 961256.995, - "profiling_overhead": 53274.698, - "profiling_runs": 994229.974, + "compilation_time": 14433.998, + "data": 58524.528, + "framework": 3545812.423, + "kernel_overhead": 1698106.457, + "profiling_overhead": 48611.943, + "profiling_runs": 1740569.495, "runtimes": [ - 4424.672 + 6277.28 ], - "search_algorithm": 20.763, - "validation": 11.442 + "search_algorithm": 32.669, + "validation": 17.029 }, - "timestamp": "2026-01-27 09:25:26 UTC" + "timestamp": "2026-03-13 09:40:13 UTC" }, { + "compilation_data": { + "constant_memory_size": 0, + "global_size": { + "x": 8, + "y": 128, + "z": 8 + }, + "local_memory_size": 0, + "local_size": { + "x": 32, + "y": 2, + "z": 1 + }, + "max_work_group_size": 1024, + "private_memory_size": 0, + "registers": 48 + }, "configuration": { - "INNER_UNROLL_FACTOR": "2", - "USE_CONSTANT_MEMORY": "0", - "USE_SOA": "0", - "VECTOR_SIZE": "1", - "WORK_GROUP_SIZE_X": "32", - "WORK_GROUP_SIZE_Y": "2", - "WORK_GROUP_SIZE_Z": "1", - "Z_ITERATIONS": "8" + "INNER_UNROLL_FACTOR": 16, + "USE_CONSTANT_MEMORY": 0, + "USE_SOA": 0, + "VECTOR_SIZE": 1, + "WORK_GROUP_SIZE_X": 32, + "WORK_GROUP_SIZE_Y": 2, + "WORK_GROUP_SIZE_Z": 1, + "Z_ITERATIONS": 32 }, "correctness": 1, "invalidity": "correct", @@ -14286,61 +15357,61 @@ { "name": "time", "unit": "", - "value": 4582.72 + "value": 5897.888 }, { "name": "dram__throughput.avg.pct_of_peak_sustained_elapsed", "type": "Double", "unit": "", - "value": 22.98985856097925 + "value": 8.229411314204023 }, { "name": "dram__sectors_read.sum", "type": "Double", "unit": "", - "value": 2101668.0 + "value": 17000.0 }, { "name": "dram__sectors_write.sum", "type": "Double", "unit": "", - "value": 1840964.0 + "value": 1839808.0 }, { "name": "lts__t_sectors.avg.pct_of_peak_sustained_elapsed", "type": "Double", "unit": "", - "value": 2.758738718226112 + "value": 1.079994925131695 }, { "name": "lts__t_sectors_op_read.sum", "type": "Double", "unit": "", - "value": 2178317.0 + "value": 115550.0 }, { "name": "lts__t_sectors_op_write.sum", "type": "Double", "unit": "", - "value": 2101741.0 + "value": 2106056.0 }, { "name": "l1tex__t_requests_pipe_lsu_mem_global_op_ld.avg.pct_of_peak_sustained_active", "type": "Double", "unit": "", - "value": 2.6596841450343 + "value": 0.4408732139948083 }, { "name": "l1tex__t_requests_pipe_lsu_mem_global_op_ld.sum", "type": "Double", "unit": "", - "value": 4718592.0 + "value": 1048576.0 }, { "name": "l1tex__data_pipe_lsu_wavefronts_mem_shared.avg.pct_of_peak_sustained_elapsed", "type": "Double", "unit": "", - "value": 0.1107943925486443 + "value": 0.020662359788787905 }, { "name": "l1tex__data_pipe_lsu_wavefronts_mem_shared_op_ld.sum", @@ -14370,13 +15441,13 @@ "name": "sm__warps_active.avg.pct_of_peak_sustained_active", "type": "Double", "unit": "", - "value": 99.03378122918278 + "value": 82.01511674321664 }, { "name": "smsp__cycles_active.avg.pct_of_peak_sustained_elapsed", "type": "Double", "unit": "", - "value": 99.9745469257261 + "value": 99.97426768598284 }, { "name": "smsp__sass_thread_inst_executed_op_fp16_pred_on.sum", @@ -14388,7 +15459,7 @@ "name": "smsp__sass_thread_inst_executed_op_fp32_pred_on.sum", "type": "Double", "unit": "", - "value": 4982833152.0 + "value": 4462739456.0 }, { "name": "smsp__sass_thread_inst_executed_op_fp64_pred_on.sum", @@ -14400,43 +15471,43 @@ "name": "smsp__sass_thread_inst_executed_op_integer_pred_on.sum", "type": "Double", "unit": "", - "value": 5192548352.0 + "value": 5813305344.0 }, { "name": "smsp__sass_thread_inst_executed_op_control_pred_on.sum", "type": "Double", "unit": "", - "value": 536870912.0 + "value": 101187584.0 }, { "name": "smsp__sass_thread_inst_executed_op_memory_pred_on.sum", "type": "Double", "unit": "", - "value": 167772160.0 + "value": 50331648.0 }, { "name": "smsp__sass_thread_inst_executed_op_misc_pred_on.sum", "type": "Double", "unit": "", - "value": 1642070016.0 + "value": 986185728.0 }, { "name": "smsp__sass_thread_inst_executed_op_conversion_pred_on.sum", "type": "Double", "unit": "", - "value": 6291456.0 + "value": 1572864.0 }, { "name": "smsp__inst_executed.sum", "type": "Double", "unit": "", - "value": 597360640.0 + "value": 897892352.0 }, { "name": "smsp__pipe_fma_cycles_active.avg.pct_of_peak_sustained_active", "type": "Double", "unit": "", - "value": 33.46835892035542 + "value": 43.94643659146637 }, { "name": "smsp__inst_executed_pipe_fp64.avg.pct_of_peak_sustained_active", @@ -14448,13 +15519,13 @@ "name": "smsp__inst_executed_pipe_xu.avg.pct_of_peak_sustained_active", "type": "Double", "unit": "", - "value": 37.82744757159691 + "value": 28.218269778748095 }, { "name": "smsp__inst_executed_pipe_lsu.avg.pct_of_peak_sustained_active", "type": "Double", "unit": "", - "value": 1.5330459709192108 + "value": 0.3410166880000075 }, { "name": "smsp__inst_executed_pipe_tex.avg.pct_of_peak_sustained_active", @@ -14466,7 +15537,7 @@ "name": "smsp__issue_active.avg.pct_of_peak_sustained_active", "type": "Double", "unit": "", - "value": 84.17951985658043 + "value": 94.50001243479227 }, { "name": "smsp__thread_inst_executed_per_inst_executed.ratio", @@ -14485,30 +15556,47 @@ "time" ], "times": { - "compilation": 13505.677, - "data": 61396.971, - "framework": 2069337.7449999999, - "kernel_overhead": 961579.173, - "profiling_overhead": 51072.823, - "profiling_runs": 995288.778, + "compilation_time": 15395.454, + "data": 58685.251, + "framework": 3454373.142, + "kernel_overhead": 1653085.528, + "profiling_overhead": 48698.623, + "profiling_runs": 1693903.74, "runtimes": [ - 4582.72 + 5897.888 ], - "search_algorithm": 24.484, - "validation": 11.333 + "search_algorithm": 22.937, + "validation": 14.802 }, - "timestamp": "2026-01-27 09:25:27 UTC" + "timestamp": "2026-03-13 09:40:15 UTC" }, { + "compilation_data": { + "constant_memory_size": 0, + "global_size": { + "x": 8, + "y": 64, + "z": 256 + }, + "local_memory_size": 0, + "local_size": { + "x": 32, + "y": 4, + "z": 1 + }, + "max_work_group_size": 1024, + "private_memory_size": 0, + "registers": 44 + }, "configuration": { - "INNER_UNROLL_FACTOR": "2", - "USE_CONSTANT_MEMORY": "0", - "USE_SOA": "0", - "VECTOR_SIZE": "1", - "WORK_GROUP_SIZE_X": "32", - "WORK_GROUP_SIZE_Y": "4", - "WORK_GROUP_SIZE_Z": "1", - "Z_ITERATIONS": "8" + "INNER_UNROLL_FACTOR": 0, + "USE_CONSTANT_MEMORY": 0, + "USE_SOA": 0, + "VECTOR_SIZE": 1, + "WORK_GROUP_SIZE_X": 32, + "WORK_GROUP_SIZE_Y": 4, + "WORK_GROUP_SIZE_Z": 1, + "Z_ITERATIONS": 1 }, "correctness": 1, "invalidity": "correct", @@ -14516,61 +15604,61 @@ { "name": "time", "unit": "", - "value": 4504.448 + "value": 3685.472 }, { "name": "dram__throughput.avg.pct_of_peak_sustained_elapsed", "type": "Double", "unit": "", - "value": 23.263110046220774 + "value": 13.08290244035894 }, { "name": "dram__sectors_read.sum", "type": "Double", "unit": "", - "value": 2104908.0 + "value": 512.0 }, { "name": "dram__sectors_write.sum", "type": "Double", "unit": "", - "value": 1837776.0 + "value": 1835800.0 }, { "name": "lts__t_sectors.avg.pct_of_peak_sustained_elapsed", "type": "Double", "unit": "", - "value": 2.7894021750464866 + "value": 1.6890298963247599 }, { "name": "lts__t_sectors_op_read.sum", "type": "Double", "unit": "", - "value": 2178263.0 + "value": 61321.0 }, { "name": "lts__t_sectors_op_write.sum", "type": "Double", "unit": "", - "value": 2101455.0 + "value": 2099466.0 }, { "name": "l1tex__t_requests_pipe_lsu_mem_global_op_ld.avg.pct_of_peak_sustained_active", "type": "Double", "unit": "", - "value": 2.6595896563983126 + "value": 22.572996507534047 }, { "name": "l1tex__t_requests_pipe_lsu_mem_global_op_ld.sum", "type": "Double", "unit": "", - "value": 4718592.0 + "value": 33554432.0 }, { "name": "l1tex__data_pipe_lsu_wavefronts_mem_shared.avg.pct_of_peak_sustained_elapsed", "type": "Double", "unit": "", - "value": 0.11078930300788721 + "value": 1.0577727558631234 }, { "name": "l1tex__data_pipe_lsu_wavefronts_mem_shared_op_ld.sum", @@ -14600,13 +15688,13 @@ "name": "sm__warps_active.avg.pct_of_peak_sustained_active", "type": "Double", "unit": "", - "value": 98.92490397838861 + "value": 72.43716671246821 }, { "name": "smsp__cycles_active.avg.pct_of_peak_sustained_elapsed", "type": "Double", "unit": "", - "value": 99.96697672058704 + "value": 99.9555866838285 }, { "name": "smsp__sass_thread_inst_executed_op_fp16_pred_on.sum", @@ -14618,7 +15706,7 @@ "name": "smsp__sass_thread_inst_executed_op_fp32_pred_on.sum", "type": "Double", "unit": "", - "value": 4982833152.0 + "value": 8589934592.0 }, { "name": "smsp__sass_thread_inst_executed_op_fp64_pred_on.sum", @@ -14630,43 +15718,43 @@ "name": "smsp__sass_thread_inst_executed_op_integer_pred_on.sum", "type": "Double", "unit": "", - "value": 5192548352.0 + "value": 587202560.0 }, { "name": "smsp__sass_thread_inst_executed_op_control_pred_on.sum", "type": "Double", "unit": "", - "value": 536870912.0 + "value": 117440512.0 }, { "name": "smsp__sass_thread_inst_executed_op_memory_pred_on.sum", "type": "Double", "unit": "", - "value": 167772160.0 + "value": 1090519040.0 }, { "name": "smsp__sass_thread_inst_executed_op_misc_pred_on.sum", "type": "Double", "unit": "", - "value": 1642070016.0 + "value": 234881024.0 }, { "name": "smsp__sass_thread_inst_executed_op_conversion_pred_on.sum", "type": "Double", "unit": "", - "value": 6291456.0 + "value": 50331648.0 }, { "name": "smsp__inst_executed.sum", "type": "Double", "unit": "", - "value": 597360640.0 + "value": 341311488.0 }, { "name": "smsp__pipe_fma_cycles_active.avg.pct_of_peak_sustained_active", "type": "Double", "unit": "", - "value": 33.46924492879042 + "value": 40.12449724812488 }, { "name": "smsp__inst_executed_pipe_fp64.avg.pct_of_peak_sustained_active", @@ -14678,13 +15766,13 @@ "name": "smsp__inst_executed_pipe_xu.avg.pct_of_peak_sustained_active", "type": "Double", "unit": "", - "value": 37.82857432915749 + "value": 45.151690946750215 }, { "name": "smsp__inst_executed_pipe_lsu.avg.pct_of_peak_sustained_active", "type": "Double", "unit": "", - "value": 1.5330916354101911 + "value": 11.993417907730528 }, { "name": "smsp__inst_executed_pipe_tex.avg.pct_of_peak_sustained_active", @@ -14696,7 +15784,7 @@ "name": "smsp__issue_active.avg.pct_of_peak_sustained_active", "type": "Double", "unit": "", - "value": 84.18205265879106 + "value": 57.41037078258023 }, { "name": "smsp__thread_inst_executed_per_inst_executed.ratio", @@ -14715,30 +15803,47 @@ "time" ], "times": { - "compilation": 13791.908, - "data": 59578.008, - "framework": 2067693.9349999998, - "kernel_overhead": 962573.737, - "profiling_overhead": 49558.49, - "profiling_runs": 995983.7, + "compilation_time": 22107.845, + "data": 58026.889, + "framework": 245074.51, + "kernel_overhead": 54245.581, + "profiling_overhead": 48129.576, + "profiling_runs": 84672.464, "runtimes": [ - 4504.448 + 3685.472 ], - "search_algorithm": 31.621, - "validation": 13.359 + "search_algorithm": 23.766, + "validation": 18.893 }, - "timestamp": "2026-01-27 09:25:28 UTC" + "timestamp": "2026-03-13 09:40:15 UTC" }, { + "compilation_data": { + "constant_memory_size": 0, + "global_size": { + "x": 8, + "y": 64, + "z": 128 + }, + "local_memory_size": 0, + "local_size": { + "x": 32, + "y": 4, + "z": 1 + }, + "max_work_group_size": 1024, + "private_memory_size": 0, + "registers": 31 + }, "configuration": { - "INNER_UNROLL_FACTOR": "2", - "USE_CONSTANT_MEMORY": "0", - "USE_SOA": "0", - "VECTOR_SIZE": "1", - "WORK_GROUP_SIZE_X": "32", - "WORK_GROUP_SIZE_Y": "8", - "WORK_GROUP_SIZE_Z": "1", - "Z_ITERATIONS": "8" + "INNER_UNROLL_FACTOR": 0, + "USE_CONSTANT_MEMORY": 0, + "USE_SOA": 0, + "VECTOR_SIZE": 1, + "WORK_GROUP_SIZE_X": 32, + "WORK_GROUP_SIZE_Y": 4, + "WORK_GROUP_SIZE_Z": 1, + "Z_ITERATIONS": 2 }, "correctness": 1, "invalidity": "correct", @@ -14746,61 +15851,61 @@ { "name": "time", "unit": "", - "value": 4449.952 + "value": 2134.72 }, { "name": "dram__throughput.avg.pct_of_peak_sustained_elapsed", "type": "Double", "unit": "", - "value": 22.903741383144318 + "value": 22.850668393947103 }, { "name": "dram__sectors_read.sum", "type": "Double", "unit": "", - "value": 2110244.0 + "value": 892.0 }, { "name": "dram__sectors_write.sum", "type": "Double", "unit": "", - "value": 1844884.0 + "value": 1839212.0 }, { "name": "lts__t_sectors.avg.pct_of_peak_sustained_elapsed", "type": "Double", "unit": "", - "value": 2.7438412038273228 + "value": 2.933742597674666 }, { "name": "lts__t_sectors_op_read.sum", "type": "Double", "unit": "", - "value": 2185983.0 + "value": 36054.0 }, { "name": "lts__t_sectors_op_write.sum", "type": "Double", "unit": "", - "value": 2103306.0 + "value": 2100201.0 }, { "name": "l1tex__t_requests_pipe_lsu_mem_global_op_ld.avg.pct_of_peak_sustained_active", "type": "Double", "unit": "", - "value": 2.6412548109810827 + "value": 19.907506338501875 }, { "name": "l1tex__t_requests_pipe_lsu_mem_global_op_ld.sum", "type": "Double", "unit": "", - "value": 4718592.0 + "value": 16777216.0 }, { "name": "l1tex__data_pipe_lsu_wavefronts_mem_shared.avg.pct_of_peak_sustained_elapsed", "type": "Double", "unit": "", - "value": 0.11003028877399144 + "value": 0.9327310078123242 }, { "name": "l1tex__data_pipe_lsu_wavefronts_mem_shared_op_ld.sum", @@ -14830,13 +15935,13 @@ "name": "sm__warps_active.avg.pct_of_peak_sustained_active", "type": "Double", "unit": "", - "value": 98.52074699954228 + "value": 94.30415073401413 }, { "name": "smsp__cycles_active.avg.pct_of_peak_sustained_elapsed", "type": "Double", "unit": "", - "value": 99.96914534300026 + "value": 99.9333528901577 }, { "name": "smsp__sass_thread_inst_executed_op_fp16_pred_on.sum", @@ -14848,7 +15953,7 @@ "name": "smsp__sass_thread_inst_executed_op_fp32_pred_on.sum", "type": "Double", "unit": "", - "value": 4982833152.0 + "value": 6442450944.0 }, { "name": "smsp__sass_thread_inst_executed_op_fp64_pred_on.sum", @@ -14860,43 +15965,43 @@ "name": "smsp__sass_thread_inst_executed_op_integer_pred_on.sum", "type": "Double", "unit": "", - "value": 5192548352.0 + "value": 704643072.0 }, { "name": "smsp__sass_thread_inst_executed_op_control_pred_on.sum", "type": "Double", "unit": "", - "value": 536870912.0 + "value": 142606336.0 }, { "name": "smsp__sass_thread_inst_executed_op_memory_pred_on.sum", "type": "Double", "unit": "", - "value": 167772160.0 + "value": 553648128.0 }, { "name": "smsp__sass_thread_inst_executed_op_misc_pred_on.sum", "type": "Double", "unit": "", - "value": 1642070016.0 + "value": 134217728.0 }, { "name": "smsp__sass_thread_inst_executed_op_conversion_pred_on.sum", "type": "Double", "unit": "", - "value": 6291456.0 + "value": 25165824.0 }, { "name": "smsp__inst_executed.sum", "type": "Double", "unit": "", - "value": 597360640.0 + "value": 256901120.0 }, { "name": "smsp__pipe_fma_cycles_active.avg.pct_of_peak_sustained_active", "type": "Double", "unit": "", - "value": 33.239438941770835 + "value": 50.47889305982858 }, { "name": "smsp__inst_executed_pipe_fp64.avg.pct_of_peak_sustained_active", @@ -14908,13 +16013,13 @@ "name": "smsp__inst_executed_pipe_xu.avg.pct_of_peak_sustained_active", "type": "Double", "unit": "", - "value": 37.5685968965678 + "value": 79.6461278422932 }, { "name": "smsp__inst_executed_pipe_lsu.avg.pct_of_peak_sustained_active", "type": "Double", "unit": "", - "value": 1.5225554406323862 + "value": 10.733560197496544 }, { "name": "smsp__inst_executed_pipe_tex.avg.pct_of_peak_sustained_active", @@ -14926,7 +16031,7 @@ "name": "smsp__issue_active.avg.pct_of_peak_sustained_active", "type": "Double", "unit": "", - "value": 83.60352901950677 + "value": 76.22488065503386 }, { "name": "smsp__thread_inst_executed_per_inst_executed.ratio", @@ -14945,260 +16050,47 @@ "time" ], "times": { - "compilation": 13489.687, - "data": 62959.229, - "framework": 2071432.8159999999, - "kernel_overhead": 960983.531, - "profiling_overhead": 52887.716, - "profiling_runs": 994602.34, + "compilation_time": 14637.747, + "data": 56816.215, + "framework": 246647.953, + "kernel_overhead": 60282.096, + "profiling_overhead": 46977.306, + "profiling_runs": 82572.336, "runtimes": [ - 4449.952 + 2134.72 ], - "search_algorithm": 23.114, - "validation": 12.078 + "search_algorithm": 33.888, + "validation": 15.43 }, - "timestamp": "2026-01-27 09:25:29 UTC" + "timestamp": "2026-03-13 09:40:15 UTC" }, { - "configuration": { - "INNER_UNROLL_FACTOR": "2", - "USE_CONSTANT_MEMORY": "0", - "USE_SOA": "0", - "VECTOR_SIZE": "1", - "WORK_GROUP_SIZE_X": "16", - "WORK_GROUP_SIZE_Y": "4", - "WORK_GROUP_SIZE_Z": "1", - "Z_ITERATIONS": "16" + "compilation_data": { + "constant_memory_size": 0, + "global_size": { + "x": 8, + "y": 64, + "z": 64 + }, + "local_memory_size": 0, + "local_size": { + "x": 32, + "y": 4, + "z": 1 + }, + "max_work_group_size": 1024, + "private_memory_size": 0, + "registers": 39 }, - "correctness": 1, - "invalidity": "correct", - "measurements": [ - { - "name": "time", - "unit": "", - "value": 8418.336 - }, - { - "name": "dram__throughput.avg.pct_of_peak_sustained_elapsed", - "type": "Double", - "unit": "", - "value": 12.906943179275737 - }, - { - "name": "dram__sectors_read.sum", - "type": "Double", - "unit": "", - "value": 2111436.0 - }, - { - "name": "dram__sectors_write.sum", - "type": "Double", - "unit": "", - "value": 2023676.0 - }, - { - "name": "lts__t_sectors.avg.pct_of_peak_sustained_elapsed", - "type": "Double", - "unit": "", - "value": 56.640745962172765 - }, - { - "name": "lts__t_sectors_op_read.sum", - "type": "Double", - "unit": "", - "value": 25843085.0 - }, - { - "name": "lts__t_sectors_op_write.sum", - "type": "Double", - "unit": "", - "value": 138424761.0 - }, - { - "name": "l1tex__t_requests_pipe_lsu_mem_global_op_ld.avg.pct_of_peak_sustained_active", - "type": "Double", - "unit": "", - "value": 0.7789486442692485 - }, - { - "name": "l1tex__t_requests_pipe_lsu_mem_global_op_ld.sum", - "type": "Double", - "unit": "", - "value": 2621440.0 - }, - { - "name": "l1tex__data_pipe_lsu_wavefronts_mem_shared.avg.pct_of_peak_sustained_elapsed", - "type": "Double", - "unit": "", - "value": 0.029386342711573595 - }, - { - "name": "l1tex__data_pipe_lsu_wavefronts_mem_shared_op_ld.sum", - "type": "Double", - "unit": "", - "value": 0.0 - }, - { - "name": "l1tex__data_pipe_lsu_wavefronts_mem_shared_op_st.sum", - "type": "Double", - "unit": "", - "value": 0.0 - }, - { - "name": "l1tex__t_sectors_pipe_lsu_mem_local_op_ld.sum", - "type": "Double", - "unit": "", - "value": 136314880.0 - }, - { - "name": "l1tex__t_sectors_pipe_lsu_mem_local_op_st.sum", - "type": "Double", - "unit": "", - "value": 136314880.0 - }, - { - "name": "sm__warps_active.avg.pct_of_peak_sustained_active", - "type": "Double", - "unit": "", - "value": 90.6170714229253 - }, - { - "name": "smsp__cycles_active.avg.pct_of_peak_sustained_elapsed", - "type": "Double", - "unit": "", - "value": 99.95266517663146 - }, - { - "name": "smsp__sass_thread_inst_executed_op_fp16_pred_on.sum", - "type": "Double", - "unit": "", - "value": 0.0 - }, - { - "name": "smsp__sass_thread_inst_executed_op_fp32_pred_on.sum", - "type": "Double", - "unit": "", - "value": 4647288832.0 - }, - { - "name": "smsp__sass_thread_inst_executed_op_fp64_pred_on.sum", - "type": "Double", - "unit": "", - "value": 0.0 - }, - { - "name": "smsp__sass_thread_inst_executed_op_integer_pred_on.sum", - "type": "Double", - "unit": "", - "value": 1888485376.0 - }, - { - "name": "smsp__sass_thread_inst_executed_op_control_pred_on.sum", - "type": "Double", - "unit": "", - "value": 536870912.0 - }, - { - "name": "smsp__sass_thread_inst_executed_op_memory_pred_on.sum", - "type": "Double", - "unit": "", - "value": 1195376640.0 - }, - { - "name": "smsp__sass_thread_inst_executed_op_misc_pred_on.sum", - "type": "Double", - "unit": "", - "value": 213909504.0 - }, - { - "name": "smsp__sass_thread_inst_executed_op_conversion_pred_on.sum", - "type": "Double", - "unit": "", - "value": 3145728.0 - }, - { - "name": "smsp__inst_executed.sum", - "type": "Double", - "unit": "", - "value": 267419648.0 - }, - { - "name": "smsp__pipe_fma_cycles_active.avg.pct_of_peak_sustained_active", - "type": "Double", - "unit": "", - "value": 8.78799168027154 - }, - { - "name": "smsp__inst_executed_pipe_fp64.avg.pct_of_peak_sustained_active", - "type": "Double", - "unit": "", - "value": 0.0 - }, - { - "name": "smsp__inst_executed_pipe_xu.avg.pct_of_peak_sustained_active", - "type": "Double", - "unit": "", - "value": 20.07057699660463 - }, - { - "name": "smsp__inst_executed_pipe_lsu.avg.pct_of_peak_sustained_active", - "type": "Double", - "unit": "", - "value": 5.600749391386498 - }, - { - "name": "smsp__inst_executed_pipe_tex.avg.pct_of_peak_sustained_active", - "type": "Double", - "unit": "", - "value": 0.0 - }, - { - "name": "smsp__issue_active.avg.pct_of_peak_sustained_active", - "type": "Double", - "unit": "", - "value": 19.99491575655513 - }, - { - "name": "smsp__thread_inst_executed_per_inst_executed.ratio", - "type": "Double", - "unit": "", - "value": 32.0 - }, - { - "name": "smsp__thread_inst_executed_per_inst_executed.pct", - "type": "Double", - "unit": "", - "value": 100.0 - } - ], - "objectives": [ - "time" - ], - "times": { - "compilation": 13689.486, - "data": 63571.239, - "framework": 556157.3539999999, - "kernel_overhead": 192870.216, - "profiling_overhead": 53365.947, - "profiling_runs": 246349.952, - "runtimes": [ - 8418.336 - ], - "search_algorithm": 22.646, - "validation": 13.233 - }, - "timestamp": "2026-01-27 09:25:29 UTC" - }, - { "configuration": { - "INNER_UNROLL_FACTOR": "2", - "USE_CONSTANT_MEMORY": "0", - "USE_SOA": "0", - "VECTOR_SIZE": "1", - "WORK_GROUP_SIZE_X": "16", - "WORK_GROUP_SIZE_Y": "8", - "WORK_GROUP_SIZE_Z": "1", - "Z_ITERATIONS": "16" + "INNER_UNROLL_FACTOR": 0, + "USE_CONSTANT_MEMORY": 0, + "USE_SOA": 0, + "VECTOR_SIZE": 1, + "WORK_GROUP_SIZE_X": 32, + "WORK_GROUP_SIZE_Y": 4, + "WORK_GROUP_SIZE_Z": 1, + "Z_ITERATIONS": 4 }, "correctness": 1, "invalidity": "correct", @@ -15206,61 +16098,61 @@ { "name": "time", "unit": "", - "value": 8671.296 + "value": 1876.416 }, { "name": "dram__throughput.avg.pct_of_peak_sustained_elapsed", "type": "Double", "unit": "", - "value": 12.862577667618485 + "value": 25.737377726607402 }, { "name": "dram__sectors_read.sum", "type": "Double", "unit": "", - "value": 2109124.0 + "value": 5992.0 }, { "name": "dram__sectors_write.sum", "type": "Double", "unit": "", - "value": 2019216.0 + "value": 1839588.0 }, { "name": "lts__t_sectors.avg.pct_of_peak_sustained_elapsed", "type": "Double", "unit": "", - "value": 56.89212500146618 + "value": 3.340883091060524 }, { "name": "lts__t_sectors_op_read.sum", "type": "Double", "unit": "", - "value": 26882997.0 + "value": 39598.0 }, { "name": "lts__t_sectors_op_write.sum", "type": "Double", "unit": "", - "value": 138417484.0 + "value": 2104158.0 }, { "name": "l1tex__t_requests_pipe_lsu_mem_global_op_ld.avg.pct_of_peak_sustained_active", "type": "Double", "unit": "", - "value": 0.7795377872975848 + "value": 11.290846772321125 }, { "name": "l1tex__t_requests_pipe_lsu_mem_global_op_ld.sum", "type": "Double", "unit": "", - "value": 2621440.0 + "value": 8388608.0 }, { "name": "l1tex__data_pipe_lsu_wavefronts_mem_shared.avg.pct_of_peak_sustained_elapsed", "type": "Double", "unit": "", - "value": 0.029251471553979426 + "value": 0.528871553463349 }, { "name": "l1tex__data_pipe_lsu_wavefronts_mem_shared_op_ld.sum", @@ -15278,25 +16170,25 @@ "name": "l1tex__t_sectors_pipe_lsu_mem_local_op_ld.sum", "type": "Double", "unit": "", - "value": 136314880.0 + "value": 0.0 }, { "name": "l1tex__t_sectors_pipe_lsu_mem_local_op_st.sum", "type": "Double", "unit": "", - "value": 136314880.0 + "value": 0.0 }, { "name": "sm__warps_active.avg.pct_of_peak_sustained_active", "type": "Double", "unit": "", - "value": 91.48382983354784 + "value": 95.87348230235104 }, { "name": "smsp__cycles_active.avg.pct_of_peak_sustained_elapsed", "type": "Double", "unit": "", - "value": 99.4992650145235 + "value": 99.90100907868191 }, { "name": "smsp__sass_thread_inst_executed_op_fp16_pred_on.sum", @@ -15308,7 +16200,7 @@ "name": "smsp__sass_thread_inst_executed_op_fp32_pred_on.sum", "type": "Double", "unit": "", - "value": 4647288832.0 + "value": 5368709120.0 }, { "name": "smsp__sass_thread_inst_executed_op_fp64_pred_on.sum", @@ -15320,43 +16212,43 @@ "name": "smsp__sass_thread_inst_executed_op_integer_pred_on.sum", "type": "Double", "unit": "", - "value": 1888485376.0 + "value": 390070272.0 }, { "name": "smsp__sass_thread_inst_executed_op_control_pred_on.sum", "type": "Double", "unit": "", - "value": 536870912.0 + "value": 71303168.0 }, { "name": "smsp__sass_thread_inst_executed_op_memory_pred_on.sum", "type": "Double", "unit": "", - "value": 1195376640.0 + "value": 285212672.0 }, { "name": "smsp__sass_thread_inst_executed_op_misc_pred_on.sum", "type": "Double", "unit": "", - "value": 213909504.0 + "value": 88080384.0 }, { "name": "smsp__sass_thread_inst_executed_op_conversion_pred_on.sum", "type": "Double", "unit": "", - "value": 3145728.0 + "value": 12582912.0 }, { "name": "smsp__inst_executed.sum", "type": "Double", "unit": "", - "value": 267419648.0 + "value": 197394432.0 }, { "name": "smsp__pipe_fma_cycles_active.avg.pct_of_peak_sustained_active", "type": "Double", "unit": "", - "value": 8.787985790455073 + "value": 45.83718702765216 }, { "name": "smsp__inst_executed_pipe_fp64.avg.pct_of_peak_sustained_active", @@ -15368,13 +16260,13 @@ "name": "smsp__inst_executed_pipe_xu.avg.pct_of_peak_sustained_active", "type": "Double", "unit": "", - "value": 20.06949958669058 + "value": 90.35018360357333 }, { "name": "smsp__inst_executed_pipe_lsu.avg.pct_of_peak_sustained_active", "type": "Double", "unit": "", - "value": 5.600448737203938 + "value": 6.264514683450885 }, { "name": "smsp__inst_executed_pipe_tex.avg.pct_of_peak_sustained_active", @@ -15386,7 +16278,7 @@ "name": "smsp__issue_active.avg.pct_of_peak_sustained_active", "type": "Double", "unit": "", - "value": 19.993861174159633 + "value": 66.44057444073563 }, { "name": "smsp__thread_inst_executed_per_inst_executed.ratio", @@ -15405,30 +16297,47 @@ "time" ], "times": { - "compilation": 13864.809, - "data": 63812.673, - "framework": 558104.856, - "kernel_overhead": 193812.824, - "profiling_overhead": 53309.338, - "profiling_runs": 247170.021, + "compilation_time": 15381.889, + "data": 57416.347, + "framework": 196555.738, + "kernel_overhead": 35105.083, + "profiling_overhead": 47813.69, + "profiling_runs": 56220.618, "runtimes": [ - 8671.296 + 1876.416 ], - "search_algorithm": 23.931, - "validation": 15.227 + "search_algorithm": 23.707, + "validation": 15.802 }, - "timestamp": "2026-01-27 09:25:30 UTC" + "timestamp": "2026-03-13 09:40:15 UTC" }, { + "compilation_data": { + "constant_memory_size": 0, + "global_size": { + "x": 8, + "y": 64, + "z": 32 + }, + "local_memory_size": 0, + "local_size": { + "x": 32, + "y": 4, + "z": 1 + }, + "max_work_group_size": 1024, + "private_memory_size": 0, + "registers": 40 + }, "configuration": { - "INNER_UNROLL_FACTOR": "2", - "USE_CONSTANT_MEMORY": "0", - "USE_SOA": "0", - "VECTOR_SIZE": "1", - "WORK_GROUP_SIZE_X": "32", - "WORK_GROUP_SIZE_Y": "2", - "WORK_GROUP_SIZE_Z": "1", - "Z_ITERATIONS": "16" + "INNER_UNROLL_FACTOR": 0, + "USE_CONSTANT_MEMORY": 0, + "USE_SOA": 0, + "VECTOR_SIZE": 1, + "WORK_GROUP_SIZE_X": 32, + "WORK_GROUP_SIZE_Y": 4, + "WORK_GROUP_SIZE_Z": 1, + "Z_ITERATIONS": 8 }, "correctness": 1, "invalidity": "correct", @@ -15436,61 +16345,61 @@ { "name": "time", "unit": "", - "value": 8326.815 + "value": 1840.096 }, { "name": "dram__throughput.avg.pct_of_peak_sustained_elapsed", "type": "Double", "unit": "", - "value": 13.001151387520393 + "value": 26.559723353448593 }, { "name": "dram__sectors_read.sum", "type": "Double", "unit": "", - "value": 2117676.0 + "value": 200.0 }, { "name": "dram__sectors_write.sum", "type": "Double", "unit": "", - "value": 2003352.0 + "value": 1836356.0 }, { "name": "lts__t_sectors.avg.pct_of_peak_sustained_elapsed", "type": "Double", "unit": "", - "value": 55.84609618413121 + "value": 3.4084468916351294 }, { "name": "lts__t_sectors_op_read.sum", "type": "Double", "unit": "", - "value": 21827418.0 + "value": 30798.0 }, { "name": "lts__t_sectors_op_write.sum", "type": "Double", "unit": "", - "value": 138420048.0 + "value": 2098903.0 }, { "name": "l1tex__t_requests_pipe_lsu_mem_global_op_ld.avg.pct_of_peak_sustained_active", "type": "Double", "unit": "", - "value": 0.7939141542342815 + "value": 5.797118836206456 }, { "name": "l1tex__t_requests_pipe_lsu_mem_global_op_ld.sum", "type": "Double", "unit": "", - "value": 2621440.0 + "value": 4194304.0 }, { "name": "l1tex__data_pipe_lsu_wavefronts_mem_shared.avg.pct_of_peak_sustained_elapsed", "type": "Double", "unit": "", - "value": 0.02979428269827964 + "value": 0.2715511397556902 }, { "name": "l1tex__data_pipe_lsu_wavefronts_mem_shared_op_ld.sum", @@ -15508,25 +16417,25 @@ "name": "l1tex__t_sectors_pipe_lsu_mem_local_op_ld.sum", "type": "Double", "unit": "", - "value": 136314880.0 + "value": 0.0 }, { "name": "l1tex__t_sectors_pipe_lsu_mem_local_op_st.sum", "type": "Double", "unit": "", - "value": 136314880.0 + "value": 0.0 }, { "name": "sm__warps_active.avg.pct_of_peak_sustained_active", "type": "Double", "unit": "", - "value": 90.57818370070251 + "value": 98.19979726666077 }, { "name": "smsp__cycles_active.avg.pct_of_peak_sustained_elapsed", "type": "Double", "unit": "", - "value": 99.906296853619 + "value": 99.91765816091504 }, { "name": "smsp__sass_thread_inst_executed_op_fp16_pred_on.sum", @@ -15538,7 +16447,7 @@ "name": "smsp__sass_thread_inst_executed_op_fp32_pred_on.sum", "type": "Double", "unit": "", - "value": 4647288832.0 + "value": 4831838208.0 }, { "name": "smsp__sass_thread_inst_executed_op_fp64_pred_on.sum", @@ -15550,43 +16459,43 @@ "name": "smsp__sass_thread_inst_executed_op_integer_pred_on.sum", "type": "Double", "unit": "", - "value": 1888485376.0 + "value": 362807296.0 }, { "name": "smsp__sass_thread_inst_executed_op_control_pred_on.sum", "type": "Double", "unit": "", - "value": 536870912.0 + "value": 69206016.0 }, { "name": "smsp__sass_thread_inst_executed_op_memory_pred_on.sum", "type": "Double", "unit": "", - "value": 1195376640.0 + "value": 150994944.0 }, { "name": "smsp__sass_thread_inst_executed_op_misc_pred_on.sum", "type": "Double", "unit": "", - "value": 213909504.0 + "value": 127926272.0 }, { "name": "smsp__sass_thread_inst_executed_op_conversion_pred_on.sum", "type": "Double", "unit": "", - "value": 3145728.0 + "value": 6291456.0 }, { "name": "smsp__inst_executed.sum", "type": "Double", "unit": "", - "value": 267419648.0 + "value": 173801472.0 }, { "name": "smsp__pipe_fma_cycles_active.avg.pct_of_peak_sustained_active", "type": "Double", "unit": "", - "value": 8.91424762616427 + "value": 42.624645236533055 }, { "name": "smsp__inst_executed_pipe_fp64.avg.pct_of_peak_sustained_active", @@ -15598,13 +16507,13 @@ "name": "smsp__inst_executed_pipe_xu.avg.pct_of_peak_sustained_active", "type": "Double", "unit": "", - "value": 20.358640341919674 + "value": 92.7658408026351 }, { "name": "smsp__inst_executed_pipe_lsu.avg.pct_of_peak_sustained_active", "type": "Double", "unit": "", - "value": 5.681134255569869 + "value": 3.397186552830875 }, { "name": "smsp__inst_executed_pipe_tex.avg.pct_of_peak_sustained_active", @@ -15616,7 +16525,7 @@ "name": "smsp__issue_active.avg.pct_of_peak_sustained_active", "type": "Double", "unit": "", - "value": 20.28189097304252 + "value": 60.06348851773476 }, { "name": "smsp__thread_inst_executed_per_inst_executed.ratio", @@ -15635,30 +16544,47 @@ "time" ], "times": { - "compilation": 13999.055, - "data": 65423.447, - "framework": 563883.262, - "kernel_overhead": 195217.318, - "profiling_overhead": 54864.868, - "profiling_runs": 248377.629, + "compilation_time": 15126.398, + "data": 56920.526, + "framework": 184441.12399999998, + "kernel_overhead": 29706.764, + "profiling_overhead": 47306.241, + "profiling_runs": 50507.593, "runtimes": [ - 8326.815 + 1840.096 ], - "search_algorithm": 22.594, - "validation": 12.969 + "search_algorithm": 27.424, + "validation": 14.372 }, - "timestamp": "2026-01-27 09:25:30 UTC" + "timestamp": "2026-03-13 09:40:15 UTC" }, { + "compilation_data": { + "constant_memory_size": 0, + "global_size": { + "x": 8, + "y": 64, + "z": 16 + }, + "local_memory_size": 0, + "local_size": { + "x": 32, + "y": 4, + "z": 1 + }, + "max_work_group_size": 1024, + "private_memory_size": 0, + "registers": 40 + }, "configuration": { - "INNER_UNROLL_FACTOR": "2", - "USE_CONSTANT_MEMORY": "0", - "USE_SOA": "0", - "VECTOR_SIZE": "1", - "WORK_GROUP_SIZE_X": "32", - "WORK_GROUP_SIZE_Y": "4", - "WORK_GROUP_SIZE_Z": "1", - "Z_ITERATIONS": "16" + "INNER_UNROLL_FACTOR": 0, + "USE_CONSTANT_MEMORY": 0, + "USE_SOA": 0, + "VECTOR_SIZE": 1, + "WORK_GROUP_SIZE_X": 32, + "WORK_GROUP_SIZE_Y": 4, + "WORK_GROUP_SIZE_Z": 1, + "Z_ITERATIONS": 16 }, "correctness": 1, "invalidity": "correct", @@ -15666,61 +16592,61 @@ { "name": "time", "unit": "", - "value": 8165.92 + "value": 1767.04 }, { "name": "dram__throughput.avg.pct_of_peak_sustained_elapsed", "type": "Double", "unit": "", - "value": 13.041957402909665 + "value": 27.1134111403344 }, { "name": "dram__sectors_read.sum", "type": "Double", "unit": "", - "value": 2113164.0 + "value": 7272.0 }, { "name": "dram__sectors_write.sum", "type": "Double", "unit": "", - "value": 1997060.0 + "value": 1840084.0 }, { "name": "lts__t_sectors.avg.pct_of_peak_sustained_elapsed", "type": "Double", "unit": "", - "value": 56.36488299232829 + "value": 3.5414489971724055 }, { "name": "lts__t_sectors_op_read.sum", "type": "Double", "unit": "", - "value": 23507300.0 + "value": 37526.0 }, { "name": "lts__t_sectors_op_write.sum", "type": "Double", "unit": "", - "value": 138418015.0 + "value": 2100594.0 }, { "name": "l1tex__t_requests_pipe_lsu_mem_global_op_ld.avg.pct_of_peak_sustained_active", "type": "Double", "unit": "", - "value": 0.7892185111601604 + "value": 2.995311185205292 }, { "name": "l1tex__t_requests_pipe_lsu_mem_global_op_ld.sum", "type": "Double", "unit": "", - "value": 2621440.0 + "value": 2097152.0 }, { "name": "l1tex__data_pipe_lsu_wavefronts_mem_shared.avg.pct_of_peak_sustained_elapsed", "type": "Double", "unit": "", - "value": 0.02961601848687404 + "value": 0.14028732872513203 }, { "name": "l1tex__data_pipe_lsu_wavefronts_mem_shared_op_ld.sum", @@ -15738,25 +16664,25 @@ "name": "l1tex__t_sectors_pipe_lsu_mem_local_op_ld.sum", "type": "Double", "unit": "", - "value": 136314880.0 + "value": 0.0 }, { "name": "l1tex__t_sectors_pipe_lsu_mem_local_op_st.sum", "type": "Double", "unit": "", - "value": 136314880.0 + "value": 0.0 }, { "name": "sm__warps_active.avg.pct_of_peak_sustained_active", "type": "Double", "unit": "", - "value": 92.06266513968733 + "value": 98.24925455317026 }, { "name": "smsp__cycles_active.avg.pct_of_peak_sustained_elapsed", "type": "Double", "unit": "", - "value": 99.57046273944314 + "value": 99.89722944227796 }, { "name": "smsp__sass_thread_inst_executed_op_fp16_pred_on.sum", @@ -15768,7 +16694,7 @@ "name": "smsp__sass_thread_inst_executed_op_fp32_pred_on.sum", "type": "Double", "unit": "", - "value": 4647288832.0 + "value": 4563402752.0 }, { "name": "smsp__sass_thread_inst_executed_op_fp64_pred_on.sum", @@ -15780,25 +16706,25 @@ "name": "smsp__sass_thread_inst_executed_op_integer_pred_on.sum", "type": "Double", "unit": "", - "value": 1888485376.0 + "value": 209715200.0 }, { "name": "smsp__sass_thread_inst_executed_op_control_pred_on.sum", "type": "Double", "unit": "", - "value": 536870912.0 + "value": 67108864.0 }, { "name": "smsp__sass_thread_inst_executed_op_memory_pred_on.sum", "type": "Double", "unit": "", - "value": 1195376640.0 + "value": 83886080.0 }, { "name": "smsp__sass_thread_inst_executed_op_misc_pred_on.sum", "type": "Double", "unit": "", - "value": 213909504.0 + "value": 160432128.0 }, { "name": "smsp__sass_thread_inst_executed_op_conversion_pred_on.sum", @@ -15810,13 +16736,13 @@ "name": "smsp__inst_executed.sum", "type": "Double", "unit": "", - "value": 267419648.0 + "value": 163381248.0 }, { "name": "smsp__pipe_fma_cycles_active.avg.pct_of_peak_sustained_active", "type": "Double", "unit": "", - "value": 8.890829760439175 + "value": 39.53129300756368 }, { "name": "smsp__inst_executed_pipe_fp64.avg.pct_of_peak_sustained_active", @@ -15828,13 +16754,13 @@ "name": "smsp__inst_executed_pipe_xu.avg.pct_of_peak_sustained_active", "type": "Double", "unit": "", - "value": 20.30508653281945 + "value": 95.8680071619942 }, { "name": "smsp__inst_executed_pipe_lsu.avg.pct_of_peak_sustained_active", "type": "Double", "unit": "", - "value": 5.666189918704256 + "value": 1.9426378404408005 }, { "name": "smsp__inst_executed_pipe_tex.avg.pct_of_peak_sustained_active", @@ -15846,7 +16772,7 @@ "name": "smsp__issue_active.avg.pct_of_peak_sustained_active", "type": "Double", "unit": "", - "value": 20.228560310184555 + "value": 58.350653871482905 }, { "name": "smsp__thread_inst_executed_per_inst_executed.ratio", @@ -15865,30 +16791,47 @@ "time" ], "times": { - "compilation": 14282.079, - "data": 65437.866, - "framework": 562632.094, - "kernel_overhead": 194531.89, - "profiling_overhead": 55411.171, - "profiling_runs": 247251.167, + "compilation_time": 15106.288, + "data": 57139.554, + "framework": 191262.272, + "kernel_overhead": 33291.346, + "profiling_overhead": 47043.482, + "profiling_runs": 53787.89, "runtimes": [ - 8165.92 + 1767.04 ], - "search_algorithm": 23.094, - "validation": 19.183 + "search_algorithm": 28.101, + "validation": 17.346 }, - "timestamp": "2026-01-27 09:25:30 UTC" + "timestamp": "2026-03-13 09:40:15 UTC" }, { + "compilation_data": { + "constant_memory_size": 0, + "global_size": { + "x": 8, + "y": 64, + "z": 8 + }, + "local_memory_size": 0, + "local_size": { + "x": 32, + "y": 4, + "z": 1 + }, + "max_work_group_size": 1024, + "private_memory_size": 0, + "registers": 48 + }, "configuration": { - "INNER_UNROLL_FACTOR": "2", - "USE_CONSTANT_MEMORY": "0", - "USE_SOA": "0", - "VECTOR_SIZE": "1", - "WORK_GROUP_SIZE_X": "32", - "WORK_GROUP_SIZE_Y": "8", - "WORK_GROUP_SIZE_Z": "1", - "Z_ITERATIONS": "16" + "INNER_UNROLL_FACTOR": 0, + "USE_CONSTANT_MEMORY": 0, + "USE_SOA": 0, + "VECTOR_SIZE": 1, + "WORK_GROUP_SIZE_X": 32, + "WORK_GROUP_SIZE_Y": 4, + "WORK_GROUP_SIZE_Z": 1, + "Z_ITERATIONS": 32 }, "correctness": 1, "invalidity": "correct", @@ -15896,61 +16839,61 @@ { "name": "time", "unit": "", - "value": 8107.392 + "value": 1817.376 }, { "name": "dram__throughput.avg.pct_of_peak_sustained_elapsed", "type": "Double", "unit": "", - "value": 13.004361538523964 + "value": 27.20618573813408 }, { "name": "dram__sectors_read.sum", "type": "Double", "unit": "", - "value": 2103200.0 + "value": 1500.0 }, { "name": "dram__sectors_write.sum", "type": "Double", "unit": "", - "value": 1999004.0 + "value": 1834208.0 }, { "name": "lts__t_sectors.avg.pct_of_peak_sustained_elapsed", "type": "Double", "unit": "", - "value": 53.678873878419395 + "value": 3.495134394639864 }, { "name": "lts__t_sectors_op_read.sum", "type": "Double", "unit": "", - "value": 14869319.0 + "value": 29967.0 }, { "name": "lts__t_sectors_op_write.sum", "type": "Double", "unit": "", - "value": 138415527.0 + "value": 2099415.0 }, { "name": "l1tex__t_requests_pipe_lsu_mem_global_op_ld.avg.pct_of_peak_sustained_active", "type": "Double", "unit": "", - "value": 0.7951172067805264 + "value": 1.4940856379162875 }, { "name": "l1tex__t_requests_pipe_lsu_mem_global_op_ld.sum", "type": "Double", "unit": "", - "value": 2621440.0 + "value": 1048576.0 }, { "name": "l1tex__data_pipe_lsu_wavefronts_mem_shared.avg.pct_of_peak_sustained_elapsed", "type": "Double", "unit": "", - "value": 0.029762467750363134 + "value": 0.06999054148004706 }, { "name": "l1tex__data_pipe_lsu_wavefronts_mem_shared_op_ld.sum", @@ -15968,25 +16911,25 @@ "name": "l1tex__t_sectors_pipe_lsu_mem_local_op_ld.sum", "type": "Double", "unit": "", - "value": 136314880.0 + "value": 0.0 }, { "name": "l1tex__t_sectors_pipe_lsu_mem_local_op_st.sum", "type": "Double", "unit": "", - "value": 136314880.0 + "value": 0.0 }, { "name": "sm__warps_active.avg.pct_of_peak_sustained_active", "type": "Double", "unit": "", - "value": 87.2805008214013 + "value": 81.71369419889523 }, { "name": "smsp__cycles_active.avg.pct_of_peak_sustained_elapsed", "type": "Double", "unit": "", - "value": 99.78719770465203 + "value": 99.92554996799505 }, { "name": "smsp__sass_thread_inst_executed_op_fp16_pred_on.sum", @@ -15998,7 +16941,7 @@ "name": "smsp__sass_thread_inst_executed_op_fp32_pred_on.sum", "type": "Double", "unit": "", - "value": 4647288832.0 + "value": 4429185024.0 }, { "name": "smsp__sass_thread_inst_executed_op_fp64_pred_on.sum", @@ -16010,43 +16953,43 @@ "name": "smsp__sass_thread_inst_executed_op_integer_pred_on.sum", "type": "Double", "unit": "", - "value": 1888485376.0 + "value": 138412032.0 }, { "name": "smsp__sass_thread_inst_executed_op_control_pred_on.sum", "type": "Double", "unit": "", - "value": 536870912.0 + "value": 33554432.0 }, { "name": "smsp__sass_thread_inst_executed_op_memory_pred_on.sum", "type": "Double", "unit": "", - "value": 1195376640.0 + "value": 50331648.0 }, { "name": "smsp__sass_thread_inst_executed_op_misc_pred_on.sum", "type": "Double", "unit": "", - "value": 213909504.0 + "value": 82837504.0 }, { "name": "smsp__sass_thread_inst_executed_op_conversion_pred_on.sum", "type": "Double", "unit": "", - "value": 3145728.0 + "value": 1572864.0 }, { "name": "smsp__inst_executed.sum", "type": "Double", "unit": "", - "value": 267419648.0 + "value": 150192128.0 }, { "name": "smsp__pipe_fma_cycles_active.avg.pct_of_peak_sustained_active", "type": "Double", "unit": "", - "value": 8.915485070868973 + "value": 37.928541787347484 }, { "name": "smsp__inst_executed_pipe_fp64.avg.pct_of_peak_sustained_active", @@ -16058,13 +17001,13 @@ "name": "smsp__inst_executed_pipe_xu.avg.pct_of_peak_sustained_active", "type": "Double", "unit": "", - "value": 20.36117369589923 + "value": 95.63161707027325 }, { "name": "smsp__inst_executed_pipe_lsu.avg.pct_of_peak_sustained_active", "type": "Double", "unit": "", - "value": 5.6818411949250045 + "value": 1.1557043566842105 }, { "name": "smsp__inst_executed_pipe_tex.avg.pct_of_peak_sustained_active", @@ -16076,7 +17019,7 @@ "name": "smsp__issue_active.avg.pct_of_peak_sustained_active", "type": "Double", "unit": "", - "value": 20.28443601497339 + "value": 53.50795402649145 }, { "name": "smsp__thread_inst_executed_per_inst_executed.ratio", @@ -16095,30 +17038,47 @@ "time" ], "times": { - "compilation": 14206.051, - "data": 67386.837, - "framework": 560991.885, - "kernel_overhead": 192849.082, - "profiling_overhead": 55803.947, - "profiling_runs": 244952.019, + "compilation_time": 15312.68, + "data": 56544.177, + "framework": 169856.877, + "kernel_overhead": 22879.209, + "profiling_overhead": 46980.674, + "profiling_runs": 43452.817, "runtimes": [ - 8107.392 + 1817.376 ], - "search_algorithm": 21.773, - "validation": 18.124 + "search_algorithm": 24.654, + "validation": 15.354 }, - "timestamp": "2026-01-27 09:25:31 UTC" + "timestamp": "2026-03-13 09:40:15 UTC" }, { + "compilation_data": { + "constant_memory_size": 0, + "global_size": { + "x": 8, + "y": 64, + "z": 128 + }, + "local_memory_size": 0, + "local_size": { + "x": 32, + "y": 4, + "z": 1 + }, + "max_work_group_size": 1024, + "private_memory_size": 0, + "registers": 19 + }, "configuration": { - "INNER_UNROLL_FACTOR": "2", - "USE_CONSTANT_MEMORY": "0", - "USE_SOA": "0", - "VECTOR_SIZE": "1", - "WORK_GROUP_SIZE_X": "16", - "WORK_GROUP_SIZE_Y": "4", - "WORK_GROUP_SIZE_Z": "1", - "Z_ITERATIONS": "32" + "INNER_UNROLL_FACTOR": 1, + "USE_CONSTANT_MEMORY": 0, + "USE_SOA": 0, + "VECTOR_SIZE": 1, + "WORK_GROUP_SIZE_X": 32, + "WORK_GROUP_SIZE_Y": 4, + "WORK_GROUP_SIZE_Z": 1, + "Z_ITERATIONS": 2 }, "correctness": 1, "invalidity": "correct", @@ -16126,61 +17086,61 @@ { "name": "time", "unit": "", - "value": 10212.992 + "value": 5190.304 }, { "name": "dram__throughput.avg.pct_of_peak_sustained_elapsed", "type": "Double", "unit": "", - "value": 11.249364748789285 + "value": 9.266893672545894 }, { "name": "dram__sectors_read.sum", "type": "Double", "unit": "", - "value": 2109072.0 + "value": 5048.0 }, { "name": "dram__sectors_write.sum", "type": "Double", "unit": "", - "value": 2225976.0 + "value": 1840336.0 }, { "name": "lts__t_sectors.avg.pct_of_peak_sustained_elapsed", "type": "Double", "unit": "", - "value": 79.64526026618651 + "value": 1.2080621554872668 }, { "name": "lts__t_sectors_op_read.sum", "type": "Double", "unit": "", - "value": 135593699.0 + "value": 90784.0 }, { "name": "lts__t_sectors_op_write.sum", "type": "Double", "unit": "", - "value": 138424062.0 + "value": 2101742.0 }, { "name": "l1tex__t_requests_pipe_lsu_mem_global_op_ld.avg.pct_of_peak_sustained_active", "type": "Double", "unit": "", - "value": 0.39969360747014643 + "value": 7.95471248036548 }, { "name": "l1tex__t_requests_pipe_lsu_mem_global_op_ld.sum", "type": "Double", "unit": "", - "value": 1572864.0 + "value": 16777216.0 }, { "name": "l1tex__data_pipe_lsu_wavefronts_mem_shared.avg.pct_of_peak_sustained_elapsed", "type": "Double", "unit": "", - "value": 0.01236072655964445 + "value": 0.3727828476204605 }, { "name": "l1tex__data_pipe_lsu_wavefronts_mem_shared_op_ld.sum", @@ -16198,25 +17158,25 @@ "name": "l1tex__t_sectors_pipe_lsu_mem_local_op_ld.sum", "type": "Double", "unit": "", - "value": 136314880.0 + "value": 0.0 }, { "name": "l1tex__t_sectors_pipe_lsu_mem_local_op_st.sum", "type": "Double", "unit": "", - "value": 136314880.0 + "value": 0.0 }, { "name": "sm__warps_active.avg.pct_of_peak_sustained_active", "type": "Double", "unit": "", - "value": 99.1325867899026 + "value": 97.1619163672518 }, { "name": "smsp__cycles_active.avg.pct_of_peak_sustained_elapsed", "type": "Double", "unit": "", - "value": 98.97247939758196 + "value": 99.96924453339801 }, { "name": "smsp__sass_thread_inst_executed_op_fp16_pred_on.sum", @@ -16228,7 +17188,7 @@ "name": "smsp__sass_thread_inst_executed_op_fp32_pred_on.sum", "type": "Double", "unit": "", - "value": 4479516672.0 + "value": 6979321856.0 }, { "name": "smsp__sass_thread_inst_executed_op_fp64_pred_on.sum", @@ -16240,43 +17200,43 @@ "name": "smsp__sass_thread_inst_executed_op_integer_pred_on.sum", "type": "Double", "unit": "", - "value": 1783103488.0 + "value": 8212447232.0 }, { "name": "smsp__sass_thread_inst_executed_op_control_pred_on.sum", "type": "Double", "unit": "", - "value": 536870912.0 + "value": 1073741824.0 }, { "name": "smsp__sass_thread_inst_executed_op_memory_pred_on.sum", "type": "Double", "unit": "", - "value": 1161822208.0 + "value": 553648128.0 }, { "name": "smsp__sass_thread_inst_executed_op_misc_pred_on.sum", "type": "Double", "unit": "", - "value": 106954752.0 + "value": 3321888768.0 }, { "name": "smsp__sass_thread_inst_executed_op_conversion_pred_on.sum", "type": "Double", "unit": "", - "value": 1572864.0 + "value": 25165824.0 }, { "name": "smsp__inst_executed.sum", "type": "Double", "unit": "", - "value": 253313024.0 + "value": 715390976.0 }, { "name": "smsp__pipe_fma_cycles_active.avg.pct_of_peak_sustained_active", "type": "Double", "unit": "", - "value": 6.99696324806149 + "value": 36.263770950637145 }, { "name": "smsp__inst_executed_pipe_fp64.avg.pct_of_peak_sustained_active", @@ -16288,13 +17248,13 @@ "name": "smsp__inst_executed_pipe_xu.avg.pct_of_peak_sustained_active", "type": "Double", "unit": "", - "value": 17.051721952227396 + "value": 31.820589567743024 }, { "name": "smsp__inst_executed_pipe_lsu.avg.pct_of_peak_sustained_active", "type": "Double", "unit": "", - "value": 4.618868531737377 + "value": 4.288321640965368 }, { "name": "smsp__inst_executed_pipe_tex.avg.pct_of_peak_sustained_active", @@ -16306,7 +17266,7 @@ "name": "smsp__issue_active.avg.pct_of_peak_sustained_active", "type": "Double", "unit": "", - "value": 16.091353147246224 + "value": 84.80359262102318 }, { "name": "smsp__thread_inst_executed_per_inst_executed.ratio", @@ -16325,30 +17285,47 @@ "time" ], "times": { - "compilation": 13933.001, - "data": 64381.824, - "framework": 549595.845, - "kernel_overhead": 184503.571, - "profiling_overhead": 53987.671, - "profiling_runs": 246722.779, + "compilation_time": 15021.553, + "data": 57555.909, + "framework": 2097297.394, + "kernel_overhead": 976845.126, + "profiling_overhead": 47941.592, + "profiling_runs": 1014954.767, "runtimes": [ - 10212.992 + 5190.304 ], - "search_algorithm": 35.348, - "validation": 14.827 + "search_algorithm": 23.518, + "validation": 14.73 }, - "timestamp": "2026-01-27 09:25:31 UTC" + "timestamp": "2026-03-13 09:40:17 UTC" }, { + "compilation_data": { + "constant_memory_size": 0, + "global_size": { + "x": 8, + "y": 64, + "z": 64 + }, + "local_memory_size": 0, + "local_size": { + "x": 32, + "y": 4, + "z": 1 + }, + "max_work_group_size": 1024, + "private_memory_size": 0, + "registers": 22 + }, "configuration": { - "INNER_UNROLL_FACTOR": "2", - "USE_CONSTANT_MEMORY": "0", - "USE_SOA": "0", - "VECTOR_SIZE": "1", - "WORK_GROUP_SIZE_X": "16", - "WORK_GROUP_SIZE_Y": "8", - "WORK_GROUP_SIZE_Z": "1", - "Z_ITERATIONS": "32" + "INNER_UNROLL_FACTOR": 1, + "USE_CONSTANT_MEMORY": 0, + "USE_SOA": 0, + "VECTOR_SIZE": 1, + "WORK_GROUP_SIZE_X": 32, + "WORK_GROUP_SIZE_Y": 4, + "WORK_GROUP_SIZE_Z": 1, + "Z_ITERATIONS": 4 }, "correctness": 1, "invalidity": "correct", @@ -16356,61 +17333,61 @@ { "name": "time", "unit": "", - "value": 9834.016 + "value": 5794.528 }, { "name": "dram__throughput.avg.pct_of_peak_sustained_elapsed", "type": "Double", "unit": "", - "value": 11.379811489014262 + "value": 8.26871203417209 }, { "name": "dram__sectors_read.sum", "type": "Double", "unit": "", - "value": 2122720.0 + "value": 18968.0 }, { "name": "dram__sectors_write.sum", "type": "Double", "unit": "", - "value": 2230260.0 + "value": 1841860.0 }, { "name": "lts__t_sectors.avg.pct_of_peak_sustained_elapsed", "type": "Double", "unit": "", - "value": 79.37164892868776 + "value": 1.1119226698252491 }, { "name": "lts__t_sectors_op_read.sum", "type": "Double", "unit": "", - "value": 135749324.0 + "value": 115403.0 }, { "name": "lts__t_sectors_op_write.sum", "type": "Double", "unit": "", - "value": 138425902.0 + "value": 2103678.0 }, { "name": "l1tex__t_requests_pipe_lsu_mem_global_op_ld.avg.pct_of_peak_sustained_active", "type": "Double", "unit": "", - "value": 0.3992180994080388 + "value": 3.6246085756153956 }, { "name": "l1tex__t_requests_pipe_lsu_mem_global_op_ld.sum", "type": "Double", "unit": "", - "value": 1572864.0 + "value": 8388608.0 }, { "name": "l1tex__data_pipe_lsu_wavefronts_mem_shared.avg.pct_of_peak_sustained_elapsed", "type": "Double", "unit": "", - "value": 0.012418207890097567 + "value": 0.16987257724991048 }, { "name": "l1tex__data_pipe_lsu_wavefronts_mem_shared_op_ld.sum", @@ -16428,25 +17405,25 @@ "name": "l1tex__t_sectors_pipe_lsu_mem_local_op_ld.sum", "type": "Double", "unit": "", - "value": 136314880.0 + "value": 0.0 }, { "name": "l1tex__t_sectors_pipe_lsu_mem_local_op_st.sum", "type": "Double", "unit": "", - "value": 136314880.0 + "value": 0.0 }, { "name": "sm__warps_active.avg.pct_of_peak_sustained_active", "type": "Double", "unit": "", - "value": 98.89669425008164 + "value": 98.52870575941368 }, { "name": "smsp__cycles_active.avg.pct_of_peak_sustained_elapsed", "type": "Double", "unit": "", - "value": 99.34791687695493 + "value": 99.9756970488545 }, { "name": "smsp__sass_thread_inst_executed_op_fp16_pred_on.sum", @@ -16458,7 +17435,7 @@ "name": "smsp__sass_thread_inst_executed_op_fp32_pred_on.sum", "type": "Double", "unit": "", - "value": 4479516672.0 + "value": 5637144576.0 }, { "name": "smsp__sass_thread_inst_executed_op_fp64_pred_on.sum", @@ -16470,43 +17447,43 @@ "name": "smsp__sass_thread_inst_executed_op_integer_pred_on.sum", "type": "Double", "unit": "", - "value": 1783103488.0 + "value": 10049552384.0 }, { "name": "smsp__sass_thread_inst_executed_op_control_pred_on.sum", "type": "Double", "unit": "", - "value": 536870912.0 + "value": 1073741824.0 }, { "name": "smsp__sass_thread_inst_executed_op_memory_pred_on.sum", "type": "Double", "unit": "", - "value": 1161822208.0 + "value": 285212672.0 }, { "name": "smsp__sass_thread_inst_executed_op_misc_pred_on.sum", "type": "Double", "unit": "", - "value": 106954752.0 + "value": 1392508928.0 }, { "name": "smsp__sass_thread_inst_executed_op_conversion_pred_on.sum", "type": "Double", "unit": "", - "value": 1572864.0 + "value": 12582912.0 }, { "name": "smsp__inst_executed.sum", "type": "Double", "unit": "", - "value": 253313024.0 + "value": 786956288.0 }, { "name": "smsp__pipe_fma_cycles_active.avg.pct_of_peak_sustained_active", "type": "Double", "unit": "", - "value": 7.003400042648112 + "value": 33.82731188552912 }, { "name": "smsp__inst_executed_pipe_fp64.avg.pct_of_peak_sustained_active", @@ -16518,13 +17495,13 @@ "name": "smsp__inst_executed_pipe_xu.avg.pct_of_peak_sustained_active", "type": "Double", "unit": "", - "value": 17.066279500970744 + "value": 28.99863404118194 }, { "name": "smsp__inst_executed_pipe_lsu.avg.pct_of_peak_sustained_active", "type": "Double", "unit": "", - "value": 4.622811793536875 + "value": 2.010647477464764 }, { "name": "smsp__inst_executed_pipe_tex.avg.pct_of_peak_sustained_active", @@ -16536,7 +17513,7 @@ "name": "smsp__issue_active.avg.pct_of_peak_sustained_active", "type": "Double", "unit": "", - "value": 16.10510504254778 + "value": 85.01398823736768 }, { "name": "smsp__thread_inst_executed_per_inst_executed.ratio", @@ -16555,30 +17532,47 @@ "time" ], "times": { - "compilation": 13895.54, - "data": 65456.328, - "framework": 551580.54, - "kernel_overhead": 184573.557, - "profiling_overhead": 55005.046, - "profiling_runs": 246545.609, + "compilation_time": 15314.807, + "data": 58277.336, + "framework": 2688842.41, + "kernel_overhead": 1271021.768, + "profiling_overhead": 47982.609, + "profiling_runs": 1311560.697, "runtimes": [ - 9834.016 + 5794.528 ], - "search_algorithm": 23.477, - "validation": 19.737 + "search_algorithm": 25.571, + "validation": 17.514 }, - "timestamp": "2026-01-27 09:25:31 UTC" + "timestamp": "2026-03-13 09:40:18 UTC" }, { + "compilation_data": { + "constant_memory_size": 0, + "global_size": { + "x": 8, + "y": 64, + "z": 32 + }, + "local_memory_size": 0, + "local_size": { + "x": 32, + "y": 4, + "z": 1 + }, + "max_work_group_size": 1024, + "private_memory_size": 32, + "registers": 27 + }, "configuration": { - "INNER_UNROLL_FACTOR": "2", - "USE_CONSTANT_MEMORY": "0", - "USE_SOA": "0", - "VECTOR_SIZE": "1", - "WORK_GROUP_SIZE_X": "32", - "WORK_GROUP_SIZE_Y": "2", - "WORK_GROUP_SIZE_Z": "1", - "Z_ITERATIONS": "32" + "INNER_UNROLL_FACTOR": 1, + "USE_CONSTANT_MEMORY": 0, + "USE_SOA": 0, + "VECTOR_SIZE": 1, + "WORK_GROUP_SIZE_X": 32, + "WORK_GROUP_SIZE_Y": 4, + "WORK_GROUP_SIZE_Z": 1, + "Z_ITERATIONS": 8 }, "correctness": 1, "invalidity": "correct", @@ -16586,61 +17580,61 @@ { "name": "time", "unit": "", - "value": 9505.984 + "value": 8054.335 }, { "name": "dram__throughput.avg.pct_of_peak_sustained_elapsed", "type": "Double", "unit": "", - "value": 11.616288409152348 + "value": 6.207643226186194 }, { "name": "dram__sectors_read.sum", "type": "Double", "unit": "", - "value": 2111168.0 + "value": 29324.0 }, { "name": "dram__sectors_write.sum", "type": "Double", "unit": "", - "value": 2179060.0 + "value": 1914596.0 }, { "name": "lts__t_sectors.avg.pct_of_peak_sustained_elapsed", "type": "Double", "unit": "", - "value": 81.78331294724626 + "value": 49.158568184791925 }, { "name": "lts__t_sectors_op_read.sum", "type": "Double", "unit": "", - "value": 135011180.0 + "value": 918706.0 }, { "name": "lts__t_sectors_op_write.sum", "type": "Double", "unit": "", - "value": 138416810.0 + "value": 138421298.0 }, { "name": "l1tex__t_requests_pipe_lsu_mem_global_op_ld.avg.pct_of_peak_sustained_active", "type": "Double", "unit": "", - "value": 0.4081965575175544 + "value": 1.2841558792481054 }, { "name": "l1tex__t_requests_pipe_lsu_mem_global_op_ld.sum", "type": "Double", "unit": "", - "value": 1572864.0 + "value": 4194304.0 }, { "name": "l1tex__data_pipe_lsu_wavefronts_mem_shared.avg.pct_of_peak_sustained_elapsed", "type": "Double", "unit": "", - "value": 0.012785473038987494 + "value": 0.060675301589652904 }, { "name": "l1tex__data_pipe_lsu_wavefronts_mem_shared_op_ld.sum", @@ -16670,13 +17664,13 @@ "name": "sm__warps_active.avg.pct_of_peak_sustained_active", "type": "Double", "unit": "", - "value": 98.93840071186223 + "value": 97.5463892602469 }, { "name": "smsp__cycles_active.avg.pct_of_peak_sustained_elapsed", "type": "Double", "unit": "", - "value": 100.1961867264232 + "value": 100.65379027753345 }, { "name": "smsp__sass_thread_inst_executed_op_fp16_pred_on.sum", @@ -16688,7 +17682,7 @@ "name": "smsp__sass_thread_inst_executed_op_fp32_pred_on.sum", "type": "Double", "unit": "", - "value": 4479516672.0 + "value": 4966055936.0 }, { "name": "smsp__sass_thread_inst_executed_op_fp64_pred_on.sum", @@ -16700,43 +17694,43 @@ "name": "smsp__sass_thread_inst_executed_op_integer_pred_on.sum", "type": "Double", "unit": "", - "value": 1783103488.0 + "value": 3982491648.0 }, { "name": "smsp__sass_thread_inst_executed_op_control_pred_on.sum", "type": "Double", "unit": "", - "value": 536870912.0 + "value": 1073741824.0 }, { "name": "smsp__sass_thread_inst_executed_op_memory_pred_on.sum", "type": "Double", "unit": "", - "value": 1161822208.0 + "value": 2319450112.0 }, { "name": "smsp__sass_thread_inst_executed_op_misc_pred_on.sum", "type": "Double", "unit": "", - "value": 106954752.0 + "value": 155189248.0 }, { "name": "smsp__sass_thread_inst_executed_op_conversion_pred_on.sum", "type": "Double", "unit": "", - "value": 1572864.0 + "value": 6291456.0 }, { "name": "smsp__inst_executed.sum", "type": "Double", "unit": "", - "value": 253313024.0 + "value": 395247616.0 }, { "name": "smsp__pipe_fma_cycles_active.avg.pct_of_peak_sustained_active", "type": "Double", "unit": "", - "value": 7.14856812012327 + "value": 10.760639964410403 }, { "name": "smsp__inst_executed_pipe_fp64.avg.pct_of_peak_sustained_active", @@ -16748,13 +17742,13 @@ "name": "smsp__inst_executed_pipe_xu.avg.pct_of_peak_sustained_active", "type": "Double", "unit": "", - "value": 17.422252375958674 + "value": 20.575979191142526 }, { "name": "smsp__inst_executed_pipe_lsu.avg.pct_of_peak_sustained_active", "type": "Double", "unit": "", - "value": 4.71923559841947 + "value": 11.141973106922395 }, { "name": "smsp__inst_executed_pipe_tex.avg.pct_of_peak_sustained_active", @@ -16766,7 +17760,7 @@ "name": "smsp__issue_active.avg.pct_of_peak_sustained_active", "type": "Double", "unit": "", - "value": 16.441015018549898 + "value": 30.29665175114155 }, { "name": "smsp__thread_inst_executed_per_inst_executed.ratio", @@ -16785,30 +17779,47 @@ "time" ], "times": { - "compilation": 14701.466, - "data": 69260.482, - "framework": 556243.283, - "kernel_overhead": 184748.631, - "profiling_overhead": 57847.055, - "profiling_runs": 244387.115, + "compilation_time": 14428.673, + "data": 57988.75, + "framework": 905670.345, + "kernel_overhead": 373666.562, + "profiling_overhead": 47968.085, + "profiling_runs": 426046.948, "runtimes": [ - 9505.984 + 8054.335 ], - "search_algorithm": 23.5, - "validation": 22.802 + "search_algorithm": 25.182, + "validation": 15.502 }, - "timestamp": "2026-01-27 09:25:32 UTC" + "timestamp": "2026-03-13 09:40:19 UTC" }, { + "compilation_data": { + "constant_memory_size": 0, + "global_size": { + "x": 8, + "y": 64, + "z": 16 + }, + "local_memory_size": 0, + "local_size": { + "x": 32, + "y": 4, + "z": 1 + }, + "max_work_group_size": 1024, + "private_memory_size": 64, + "registers": 32 + }, "configuration": { - "INNER_UNROLL_FACTOR": "2", - "USE_CONSTANT_MEMORY": "0", - "USE_SOA": "0", - "VECTOR_SIZE": "1", - "WORK_GROUP_SIZE_X": "32", - "WORK_GROUP_SIZE_Y": "4", - "WORK_GROUP_SIZE_Z": "1", - "Z_ITERATIONS": "32" + "INNER_UNROLL_FACTOR": 1, + "USE_CONSTANT_MEMORY": 0, + "USE_SOA": 0, + "VECTOR_SIZE": 1, + "WORK_GROUP_SIZE_X": 32, + "WORK_GROUP_SIZE_Y": 4, + "WORK_GROUP_SIZE_Z": 1, + "Z_ITERATIONS": 16 }, "correctness": 1, "invalidity": "correct", @@ -16816,61 +17827,61 @@ { "name": "time", "unit": "", - "value": 9701.184 + "value": 8318.847 }, { "name": "dram__throughput.avg.pct_of_peak_sustained_elapsed", "type": "Double", "unit": "", - "value": 11.413423315309663 + "value": 6.333929797286383 }, { "name": "dram__sectors_read.sum", "type": "Double", "unit": "", - "value": 2102752.0 + "value": 10184.0 }, { "name": "dram__sectors_write.sum", "type": "Double", "unit": "", - "value": 2182384.0 + "value": 1997540.0 }, { "name": "lts__t_sectors.avg.pct_of_peak_sustained_elapsed", "type": "Double", "unit": "", - "value": 81.78778891869305 + "value": 53.795495328351706 }, { "name": "lts__t_sectors_op_read.sum", "type": "Double", "unit": "", - "value": 134927698.0 + "value": 15905517.0 }, { "name": "lts__t_sectors_op_write.sum", "type": "Double", "unit": "", - "value": 138458227.0 + "value": 138420378.0 }, { "name": "l1tex__t_requests_pipe_lsu_mem_global_op_ld.avg.pct_of_peak_sustained_active", "type": "Double", "unit": "", - "value": 0.41065663878111375 + "value": 0.6362119932485598 }, { "name": "l1tex__t_requests_pipe_lsu_mem_global_op_ld.sum", "type": "Double", "unit": "", - "value": 1572864.0 + "value": 2097152.0 }, { "name": "l1tex__data_pipe_lsu_wavefronts_mem_shared.avg.pct_of_peak_sustained_elapsed", "type": "Double", "unit": "", - "value": 0.012759462125538655 + "value": 0.029618766130121398 }, { "name": "l1tex__data_pipe_lsu_wavefronts_mem_shared_op_ld.sum", @@ -16900,13 +17911,13 @@ "name": "sm__warps_active.avg.pct_of_peak_sustained_active", "type": "Double", "unit": "", - "value": 98.82496708000886 + "value": 90.15398668689765 }, { "name": "smsp__cycles_active.avg.pct_of_peak_sustained_elapsed", "type": "Double", "unit": "", - "value": 99.40688119775713 + "value": 99.11602510168737 }, { "name": "smsp__sass_thread_inst_executed_op_fp16_pred_on.sum", @@ -16918,7 +17929,7 @@ "name": "smsp__sass_thread_inst_executed_op_fp32_pred_on.sum", "type": "Double", "unit": "", - "value": 4479516672.0 + "value": 4630511616.0 }, { "name": "smsp__sass_thread_inst_executed_op_fp64_pred_on.sum", @@ -16930,43 +17941,43 @@ "name": "smsp__sass_thread_inst_executed_op_integer_pred_on.sum", "type": "Double", "unit": "", - "value": 1783103488.0 + "value": 3635412992.0 }, { "name": "smsp__sass_thread_inst_executed_op_control_pred_on.sum", "type": "Double", "unit": "", - "value": 536870912.0 + "value": 1073741824.0 }, { "name": "smsp__sass_thread_inst_executed_op_memory_pred_on.sum", "type": "Double", "unit": "", - "value": 1161822208.0 + "value": 2252341248.0 }, { "name": "smsp__sass_thread_inst_executed_op_misc_pred_on.sum", "type": "Double", "unit": "", - "value": 106954752.0 + "value": 77594624.0 }, { "name": "smsp__sass_thread_inst_executed_op_conversion_pred_on.sum", "type": "Double", "unit": "", - "value": 1572864.0 + "value": 3145728.0 }, { "name": "smsp__inst_executed.sum", "type": "Double", "unit": "", - "value": 253313024.0 + "value": 367034368.0 }, { "name": "smsp__pipe_fma_cycles_active.avg.pct_of_peak_sustained_active", "type": "Double", "unit": "", - "value": 7.191295584979837 + "value": 9.218365787293479 }, { "name": "smsp__inst_executed_pipe_fp64.avg.pct_of_peak_sustained_active", @@ -16978,13 +17989,13 @@ "name": "smsp__inst_executed_pipe_xu.avg.pct_of_peak_sustained_active", "type": "Double", "unit": "", - "value": 17.524862208226253 + "value": 20.400075894977874 }, { "name": "smsp__inst_executed_pipe_lsu.avg.pct_of_peak_sustained_active", "type": "Double", "unit": "", - "value": 4.747029936530036 + "value": 10.713028137230813 }, { "name": "smsp__inst_executed_pipe_tex.avg.pct_of_peak_sustained_active", @@ -16996,7 +18007,7 @@ "name": "smsp__issue_active.avg.pct_of_peak_sustained_active", "type": "Double", "unit": "", - "value": 16.537860328820095 + "value": 27.893556429453987 }, { "name": "smsp__thread_inst_executed_per_inst_executed.ratio", @@ -17015,30 +18026,47 @@ "time" ], "times": { - "compilation": 14459.096, - "data": 63657.145, - "framework": 550678.758, - "kernel_overhead": 186534.702, - "profiling_overhead": 53250.669, - "profiling_runs": 247236.242, + "compilation_time": 14698.911, + "data": 57879.109, + "framework": 876435.503, + "kernel_overhead": 358624.247, + "profiling_overhead": 47921.999, + "profiling_runs": 412010.148, "runtimes": [ - 9701.184 + 8318.847 ], - "search_algorithm": 22.237, - "validation": 15.993 + "search_algorithm": 38.706, + "validation": 18.207 }, - "timestamp": "2026-01-27 09:25:32 UTC" + "timestamp": "2026-03-13 09:40:19 UTC" }, { + "compilation_data": { + "constant_memory_size": 0, + "global_size": { + "x": 8, + "y": 64, + "z": 8 + }, + "local_memory_size": 0, + "local_size": { + "x": 32, + "y": 4, + "z": 1 + }, + "max_work_group_size": 1024, + "private_memory_size": 128, + "registers": 32 + }, "configuration": { - "INNER_UNROLL_FACTOR": "2", - "USE_CONSTANT_MEMORY": "0", - "USE_SOA": "0", - "VECTOR_SIZE": "1", - "WORK_GROUP_SIZE_X": "32", - "WORK_GROUP_SIZE_Y": "8", - "WORK_GROUP_SIZE_Z": "1", - "Z_ITERATIONS": "32" + "INNER_UNROLL_FACTOR": 1, + "USE_CONSTANT_MEMORY": 0, + "USE_SOA": 0, + "VECTOR_SIZE": 1, + "WORK_GROUP_SIZE_X": 32, + "WORK_GROUP_SIZE_Y": 4, + "WORK_GROUP_SIZE_Z": 1, + "Z_ITERATIONS": 32 }, "correctness": 1, "invalidity": "correct", @@ -17046,61 +18074,61 @@ { "name": "time", "unit": "", - "value": 9250.24 + "value": 10489.632 }, { "name": "dram__throughput.avg.pct_of_peak_sustained_elapsed", "type": "Double", "unit": "", - "value": 11.991203859541066 + "value": 5.781606328392246 }, { "name": "dram__sectors_read.sum", "type": "Double", "unit": "", - "value": 2115512.0 + "value": 7372.0 }, { "name": "dram__sectors_write.sum", "type": "Double", "unit": "", - "value": 2170716.0 + "value": 2199296.0 }, { "name": "lts__t_sectors.avg.pct_of_peak_sustained_elapsed", "type": "Double", "unit": "", - "value": 77.88306820295561 + "value": 76.45174849532879 }, { "name": "lts__t_sectors_op_read.sum", "type": "Double", "unit": "", - "value": 115350522.0 + "value": 118575942.0 }, { "name": "lts__t_sectors_op_write.sum", "type": "Double", "unit": "", - "value": 138418054.0 + "value": 138416578.0 }, { "name": "l1tex__t_requests_pipe_lsu_mem_global_op_ld.avg.pct_of_peak_sustained_active", "type": "Double", "unit": "", - "value": 0.4201702525875663 + "value": 0.27533385236589153 }, { "name": "l1tex__t_requests_pipe_lsu_mem_global_op_ld.sum", "type": "Double", "unit": "", - "value": 1572864.0 + "value": 1048576.0 }, { "name": "l1tex__data_pipe_lsu_wavefronts_mem_shared.avg.pct_of_peak_sustained_elapsed", "type": "Double", "unit": "", - "value": 0.013055948265804996 + "value": 0.012671866050455931 }, { "name": "l1tex__data_pipe_lsu_wavefronts_mem_shared_op_ld.sum", @@ -17130,13 +18158,13 @@ "name": "sm__warps_active.avg.pct_of_peak_sustained_active", "type": "Double", "unit": "", - "value": 92.79038422571556 + "value": 94.71130736355947 }, { "name": "smsp__cycles_active.avg.pct_of_peak_sustained_elapsed", "type": "Double", "unit": "", - "value": 98.98995876400714 + "value": 98.44000768345316 }, { "name": "smsp__sass_thread_inst_executed_op_fp16_pred_on.sum", @@ -17148,7 +18176,7 @@ "name": "smsp__sass_thread_inst_executed_op_fp32_pred_on.sum", "type": "Double", "unit": "", - "value": 4479516672.0 + "value": 4462739456.0 }, { "name": "smsp__sass_thread_inst_executed_op_fp64_pred_on.sum", @@ -17160,25 +18188,25 @@ "name": "smsp__sass_thread_inst_executed_op_integer_pred_on.sum", "type": "Double", "unit": "", - "value": 1783103488.0 + "value": 3461873664.0 }, { "name": "smsp__sass_thread_inst_executed_op_control_pred_on.sum", "type": "Double", "unit": "", - "value": 536870912.0 + "value": 1073741824.0 }, { "name": "smsp__sass_thread_inst_executed_op_memory_pred_on.sum", "type": "Double", "unit": "", - "value": 1161822208.0 + "value": 2218786816.0 }, { "name": "smsp__sass_thread_inst_executed_op_misc_pred_on.sum", "type": "Double", "unit": "", - "value": 106954752.0 + "value": 38797312.0 }, { "name": "smsp__sass_thread_inst_executed_op_conversion_pred_on.sum", @@ -17190,13 +18218,13 @@ "name": "smsp__inst_executed.sum", "type": "Double", "unit": "", - "value": 253313024.0 + "value": 352927744.0 }, { "name": "smsp__pipe_fma_cycles_active.avg.pct_of_peak_sustained_active", "type": "Double", "unit": "", - "value": 7.389569593545824 + "value": 7.31784994267429 }, { "name": "smsp__inst_executed_pipe_fp64.avg.pct_of_peak_sustained_active", @@ -17208,13 +18236,13 @@ "name": "smsp__inst_executed_pipe_xu.avg.pct_of_peak_sustained_active", "type": "Double", "unit": "", - "value": 18.007605607833167 + "value": 17.575497525210665 }, { "name": "smsp__inst_executed_pipe_lsu.avg.pct_of_peak_sustained_active", "type": "Double", "unit": "", - "value": 4.877792583469457 + "value": 9.085965822664448 }, { "name": "smsp__inst_executed_pipe_tex.avg.pct_of_peak_sustained_active", @@ -17226,7 +18254,7 @@ "name": "smsp__issue_active.avg.pct_of_peak_sustained_active", "type": "Double", "unit": "", - "value": 16.993415373739136 + "value": 23.107815810898046 }, { "name": "smsp__thread_inst_executed_per_inst_executed.ratio", @@ -17245,30 +18273,47 @@ "time" ], "times": { - "compilation": 13424.02, - "data": 59432.892, - "framework": 537911.733, - "kernel_overhead": 185428.415, - "profiling_overhead": 48800.708, - "profiling_runs": 244249.718, + "compilation_time": 14492.104, + "data": 58611.451, + "framework": 861139.37, + "kernel_overhead": 346321.455, + "profiling_overhead": 48270.338, + "profiling_runs": 407936.126, "runtimes": [ - 9250.24 + 10489.632 ], - "search_algorithm": 22.752, - "validation": 17.742 + "search_algorithm": 25.378, + "validation": 19.483 }, - "timestamp": "2026-01-27 09:25:32 UTC" + "timestamp": "2026-03-13 09:40:20 UTC" }, { + "compilation_data": { + "constant_memory_size": 0, + "global_size": { + "x": 8, + "y": 64, + "z": 64 + }, + "local_memory_size": 0, + "local_size": { + "x": 32, + "y": 4, + "z": 1 + }, + "max_work_group_size": 1024, + "private_memory_size": 0, + "registers": 23 + }, "configuration": { - "INNER_UNROLL_FACTOR": "4", - "USE_CONSTANT_MEMORY": "0", - "USE_SOA": "0", - "VECTOR_SIZE": "1", - "WORK_GROUP_SIZE_X": "16", - "WORK_GROUP_SIZE_Y": "4", - "WORK_GROUP_SIZE_Z": "1", - "Z_ITERATIONS": "8" + "INNER_UNROLL_FACTOR": 2, + "USE_CONSTANT_MEMORY": 0, + "USE_SOA": 0, + "VECTOR_SIZE": 1, + "WORK_GROUP_SIZE_X": 32, + "WORK_GROUP_SIZE_Y": 4, + "WORK_GROUP_SIZE_Z": 1, + "Z_ITERATIONS": 4 }, "correctness": 1, "invalidity": "correct", @@ -17276,61 +18321,61 @@ { "name": "time", "unit": "", - "value": 2830.976 + "value": 3477.984 }, { "name": "dram__throughput.avg.pct_of_peak_sustained_elapsed", "type": "Double", "unit": "", - "value": 36.248233762290674 + "value": 13.967102445292422 }, { "name": "dram__sectors_read.sum", "type": "Double", "unit": "", - "value": 2104636.0 + "value": 212.0 }, { "name": "dram__sectors_write.sum", "type": "Double", "unit": "", - "value": 1868588.0 + "value": 1836348.0 }, { "name": "lts__t_sectors.avg.pct_of_peak_sustained_elapsed", "type": "Double", "unit": "", - "value": 4.3352401732016235 + "value": 1.7994892347992841 }, { "name": "lts__t_sectors_op_read.sum", "type": "Double", "unit": "", - "value": 2140661.0 + "value": 57534.0 }, { "name": "lts__t_sectors_op_write.sum", "type": "Double", "unit": "", - "value": 2101267.0 + "value": 2099099.0 }, { "name": "l1tex__t_requests_pipe_lsu_mem_global_op_ld.avg.pct_of_peak_sustained_active", "type": "Double", "unit": "", - "value": 4.18667743673651 + "value": 6.059674072687125 }, { "name": "l1tex__t_requests_pipe_lsu_mem_global_op_ld.sum", "type": "Double", "unit": "", - "value": 4718592.0 + "value": 8388608.0 }, { "name": "l1tex__data_pipe_lsu_wavefronts_mem_shared.avg.pct_of_peak_sustained_elapsed", "type": "Double", "unit": "", - "value": 0.17438209518214118 + "value": 0.2839290671804871 }, { "name": "l1tex__data_pipe_lsu_wavefronts_mem_shared_op_ld.sum", @@ -17360,13 +18405,13 @@ "name": "sm__warps_active.avg.pct_of_peak_sustained_active", "type": "Double", "unit": "", - "value": 98.61139511228552 + "value": 97.95052779441636 }, { "name": "smsp__cycles_active.avg.pct_of_peak_sustained_elapsed", "type": "Double", "unit": "", - "value": 99.95550510565742 + "value": 99.94838560833385 }, { "name": "smsp__sass_thread_inst_executed_op_fp16_pred_on.sum", @@ -17378,7 +18423,7 @@ "name": "smsp__sass_thread_inst_executed_op_fp32_pred_on.sum", "type": "Double", "unit": "", - "value": 4982833152.0 + "value": 5637144576.0 }, { "name": "smsp__sass_thread_inst_executed_op_fp64_pred_on.sum", @@ -17390,43 +18435,43 @@ "name": "smsp__sass_thread_inst_executed_op_integer_pred_on.sum", "type": "Double", "unit": "", - "value": 2373976064.0 + "value": 4945084416.0 }, { "name": "smsp__sass_thread_inst_executed_op_control_pred_on.sum", "type": "Double", "unit": "", - "value": 268435456.0 + "value": 536870912.0 }, { "name": "smsp__sass_thread_inst_executed_op_memory_pred_on.sum", "type": "Double", "unit": "", - "value": 167772160.0 + "value": 285212672.0 }, { "name": "smsp__sass_thread_inst_executed_op_misc_pred_on.sum", "type": "Double", "unit": "", - "value": 2042626048.0 + "value": 1665138688.0 }, { "name": "smsp__sass_thread_inst_executed_op_conversion_pred_on.sum", "type": "Double", "unit": "", - "value": 6291456.0 + "value": 12582912.0 }, { "name": "smsp__inst_executed.sum", "type": "Double", "unit": "", - "value": 379191296.0 + "value": 484966400.0 }, { "name": "smsp__pipe_fma_cycles_active.avg.pct_of_peak_sustained_active", "type": "Double", "unit": "", - "value": 36.84020128283194 + "value": 41.379533977945194 }, { "name": "smsp__inst_executed_pipe_fp64.avg.pct_of_peak_sustained_active", @@ -17438,13 +18483,13 @@ "name": "smsp__inst_executed_pipe_xu.avg.pct_of_peak_sustained_active", "type": "Double", "unit": "", - "value": 59.54891805033951 + "value": 48.48225128453636 }, { "name": "smsp__inst_executed_pipe_lsu.avg.pct_of_peak_sustained_active", "type": "Double", "unit": "", - "value": 2.413359471766689 + "value": 3.3615623449239083 }, { "name": "smsp__inst_executed_pipe_tex.avg.pct_of_peak_sustained_active", @@ -17456,7 +18501,7 @@ "name": "smsp__issue_active.avg.pct_of_peak_sustained_active", "type": "Double", "unit": "", - "value": 84.11940167471322 + "value": 87.59069794504867 }, { "name": "smsp__thread_inst_executed_per_inst_executed.ratio", @@ -17475,30 +18520,47 @@ "time" ], "times": { - "compilation": 14273.068, - "data": 63863.8, - "framework": 1191935.057, - "kernel_overhead": 524477.157, - "profiling_overhead": 53189.967, - "profiling_runs": 550404.133, + "compilation_time": 14521.706, + "data": 57421.939, + "framework": 1364991.584, + "kernel_overhead": 615879.149, + "profiling_overhead": 47037.873, + "profiling_runs": 644652.623, "runtimes": [ - 2830.976 + 3477.984 ], - "search_algorithm": 22.701, - "validation": 14.72 + "search_algorithm": 25.837, + "validation": 16.089 }, - "timestamp": "2026-01-27 09:25:33 UTC" + "timestamp": "2026-03-13 09:40:20 UTC" }, { + "compilation_data": { + "constant_memory_size": 0, + "global_size": { + "x": 8, + "y": 64, + "z": 32 + }, + "local_memory_size": 0, + "local_size": { + "x": 32, + "y": 4, + "z": 1 + }, + "max_work_group_size": 1024, + "private_memory_size": 0, + "registers": 27 + }, "configuration": { - "INNER_UNROLL_FACTOR": "4", - "USE_CONSTANT_MEMORY": "0", - "USE_SOA": "0", - "VECTOR_SIZE": "1", - "WORK_GROUP_SIZE_X": "16", - "WORK_GROUP_SIZE_Y": "8", - "WORK_GROUP_SIZE_Z": "1", - "Z_ITERATIONS": "8" + "INNER_UNROLL_FACTOR": 2, + "USE_CONSTANT_MEMORY": 0, + "USE_SOA": 0, + "VECTOR_SIZE": 1, + "WORK_GROUP_SIZE_X": 32, + "WORK_GROUP_SIZE_Y": 4, + "WORK_GROUP_SIZE_Z": 1, + "Z_ITERATIONS": 8 }, "correctness": 1, "invalidity": "correct", @@ -17506,61 +18568,61 @@ { "name": "time", "unit": "", - "value": 2857.6 + "value": 4115.008 }, { "name": "dram__throughput.avg.pct_of_peak_sustained_elapsed", "type": "Double", "unit": "", - "value": 36.71802985180458 + "value": 11.735250252986877 }, { "name": "dram__sectors_read.sum", "type": "Double", "unit": "", - "value": 2105328.0 + "value": 2784.0 }, { "name": "dram__sectors_write.sum", "type": "Double", "unit": "", - "value": 1871640.0 + "value": 1837844.0 }, { "name": "lts__t_sectors.avg.pct_of_peak_sustained_elapsed", "type": "Double", "unit": "", - "value": 4.3339261804658635 + "value": 1.5190985191007014 }, { "name": "lts__t_sectors_op_read.sum", "type": "Double", "unit": "", - "value": 2140382.0 + "value": 71824.0 }, { "name": "lts__t_sectors_op_write.sum", "type": "Double", "unit": "", - "value": 2104297.0 + "value": 2099774.0 }, { "name": "l1tex__t_requests_pipe_lsu_mem_global_op_ld.avg.pct_of_peak_sustained_active", "type": "Double", "unit": "", - "value": 4.186691032671284 + "value": 2.5397556486484207 }, { "name": "l1tex__t_requests_pipe_lsu_mem_global_op_ld.sum", "type": "Double", "unit": "", - "value": 4718592.0 + "value": 4194304.0 }, { "name": "l1tex__data_pipe_lsu_wavefronts_mem_shared.avg.pct_of_peak_sustained_elapsed", "type": "Double", "unit": "", - "value": 0.1743911901794258 + "value": 0.11901589314488117 }, { "name": "l1tex__data_pipe_lsu_wavefronts_mem_shared_op_ld.sum", @@ -17590,13 +18652,13 @@ "name": "sm__warps_active.avg.pct_of_peak_sustained_active", "type": "Double", "unit": "", - "value": 98.40412635703869 + "value": 98.8477115980538 }, { "name": "smsp__cycles_active.avg.pct_of_peak_sustained_elapsed", "type": "Double", "unit": "", - "value": 99.96257948845677 + "value": 99.96176285729769 }, { "name": "smsp__sass_thread_inst_executed_op_fp16_pred_on.sum", @@ -17608,7 +18670,7 @@ "name": "smsp__sass_thread_inst_executed_op_fp32_pred_on.sum", "type": "Double", "unit": "", - "value": 4982833152.0 + "value": 4966055936.0 }, { "name": "smsp__sass_thread_inst_executed_op_fp64_pred_on.sum", @@ -17620,25 +18682,25 @@ "name": "smsp__sass_thread_inst_executed_op_integer_pred_on.sum", "type": "Double", "unit": "", - "value": 2373976064.0 + "value": 5463080960.0 }, { "name": "smsp__sass_thread_inst_executed_op_control_pred_on.sum", "type": "Double", "unit": "", - "value": 268435456.0 + "value": 536870912.0 }, { "name": "smsp__sass_thread_inst_executed_op_memory_pred_on.sum", "type": "Double", "unit": "", - "value": 167772160.0 + "value": 150994944.0 }, { "name": "smsp__sass_thread_inst_executed_op_misc_pred_on.sum", "type": "Double", "unit": "", - "value": 2042626048.0 + "value": 1369440256.0 }, { "name": "smsp__sass_thread_inst_executed_op_conversion_pred_on.sum", @@ -17650,13 +18712,13 @@ "name": "smsp__inst_executed.sum", "type": "Double", "unit": "", - "value": 379191296.0 + "value": 596246528.0 }, { "name": "smsp__pipe_fma_cycles_active.avg.pct_of_peak_sustained_active", "type": "Double", "unit": "", - "value": 36.8391526792479 + "value": 40.96808555149869 }, { "name": "smsp__inst_executed_pipe_fp64.avg.pct_of_peak_sustained_active", @@ -17668,13 +18730,13 @@ "name": "smsp__inst_executed_pipe_xu.avg.pct_of_peak_sustained_active", "type": "Double", "unit": "", - "value": 59.54780934277954 + "value": 40.63963096047015 }, { "name": "smsp__inst_executed_pipe_lsu.avg.pct_of_peak_sustained_active", "type": "Double", "unit": "", - "value": 2.4133145387942876 + "value": 1.48826773536878 }, { "name": "smsp__inst_executed_pipe_tex.avg.pct_of_peak_sustained_active", @@ -17686,7 +18748,7 @@ "name": "smsp__issue_active.avg.pct_of_peak_sustained_active", "type": "Double", "unit": "", - "value": 84.11787720782303 + "value": 90.2689634409765 }, { "name": "smsp__thread_inst_executed_per_inst_executed.ratio", @@ -17705,30 +18767,47 @@ "time" ], "times": { - "compilation": 14298.905, - "data": 65106.142, - "framework": 1192241.08, - "kernel_overhead": 523736.457, - "profiling_overhead": 53807.672, - "profiling_runs": 549590.809, + "compilation_time": 14393.689, + "data": 57160.139, + "framework": 1945709.062, + "kernel_overhead": 904684.044, + "profiling_overhead": 47114.965, + "profiling_runs": 936749.914, "runtimes": [ - 2857.6 + 4115.008 ], - "search_algorithm": 32.211, - "validation": 14.693 + "search_algorithm": 24.912, + "validation": 16.602 }, - "timestamp": "2026-01-27 09:25:34 UTC" + "timestamp": "2026-03-13 09:40:21 UTC" }, { + "compilation_data": { + "constant_memory_size": 0, + "global_size": { + "x": 8, + "y": 64, + "z": 16 + }, + "local_memory_size": 0, + "local_size": { + "x": 32, + "y": 4, + "z": 1 + }, + "max_work_group_size": 1024, + "private_memory_size": 64, + "registers": 32 + }, "configuration": { - "INNER_UNROLL_FACTOR": "4", - "USE_CONSTANT_MEMORY": "0", - "USE_SOA": "0", - "VECTOR_SIZE": "1", - "WORK_GROUP_SIZE_X": "32", - "WORK_GROUP_SIZE_Y": "2", - "WORK_GROUP_SIZE_Z": "1", - "Z_ITERATIONS": "8" + "INNER_UNROLL_FACTOR": 2, + "USE_CONSTANT_MEMORY": 0, + "USE_SOA": 0, + "VECTOR_SIZE": 1, + "WORK_GROUP_SIZE_X": 32, + "WORK_GROUP_SIZE_Y": 4, + "WORK_GROUP_SIZE_Z": 1, + "Z_ITERATIONS": 16 }, "correctness": 1, "invalidity": "correct", @@ -17736,61 +18815,61 @@ { "name": "time", "unit": "", - "value": 2801.344 + "value": 8292.8 }, { "name": "dram__throughput.avg.pct_of_peak_sustained_elapsed", "type": "Double", "unit": "", - "value": 36.98817766735647 + "value": 6.369514250128215 }, { "name": "dram__sectors_read.sum", "type": "Double", "unit": "", - "value": 2097604.0 + "value": 5620.0 }, { "name": "dram__sectors_write.sum", "type": "Double", "unit": "", - "value": 1837512.0 + "value": 1997420.0 }, { "name": "lts__t_sectors.avg.pct_of_peak_sustained_elapsed", "type": "Double", "unit": "", - "value": 4.376720560372675 + "value": 56.36826607304675 }, { "name": "lts__t_sectors_op_read.sum", "type": "Double", "unit": "", - "value": 2145098.0 + "value": 22818711.0 }, { "name": "lts__t_sectors_op_write.sum", "type": "Double", "unit": "", - "value": 2099167.0 + "value": 138423860.0 }, { "name": "l1tex__t_requests_pipe_lsu_mem_global_op_ld.avg.pct_of_peak_sustained_active", "type": "Double", "unit": "", - "value": 4.188269466893976 + "value": 0.6389366602704918 }, { "name": "l1tex__t_requests_pipe_lsu_mem_global_op_ld.sum", "type": "Double", "unit": "", - "value": 4718592.0 + "value": 2097152.0 }, { "name": "l1tex__data_pipe_lsu_wavefronts_mem_shared.avg.pct_of_peak_sustained_elapsed", "type": "Double", "unit": "", - "value": 0.17444185491542594 + "value": 0.02977053718860498 }, { "name": "l1tex__data_pipe_lsu_wavefronts_mem_shared_op_ld.sum", @@ -17808,25 +18887,272 @@ "name": "l1tex__t_sectors_pipe_lsu_mem_local_op_ld.sum", "type": "Double", "unit": "", - "value": 0.0 + "value": 136314880.0 }, { "name": "l1tex__t_sectors_pipe_lsu_mem_local_op_st.sum", "type": "Double", "unit": "", + "value": 136314880.0 + }, + { + "name": "sm__warps_active.avg.pct_of_peak_sustained_active", + "type": "Double", + "unit": "", + "value": 91.90951838698602 + }, + { + "name": "smsp__cycles_active.avg.pct_of_peak_sustained_elapsed", + "type": "Double", + "unit": "", + "value": 98.70912079509333 + }, + { + "name": "smsp__sass_thread_inst_executed_op_fp16_pred_on.sum", + "type": "Double", + "unit": "", "value": 0.0 }, + { + "name": "smsp__sass_thread_inst_executed_op_fp32_pred_on.sum", + "type": "Double", + "unit": "", + "value": 4630511616.0 + }, + { + "name": "smsp__sass_thread_inst_executed_op_fp64_pred_on.sum", + "type": "Double", + "unit": "", + "value": 0.0 + }, + { + "name": "smsp__sass_thread_inst_executed_op_integer_pred_on.sum", + "type": "Double", + "unit": "", + "value": 1958739968.0 + }, + { + "name": "smsp__sass_thread_inst_executed_op_control_pred_on.sum", + "type": "Double", + "unit": "", + "value": 536870912.0 + }, + { + "name": "smsp__sass_thread_inst_executed_op_memory_pred_on.sum", + "type": "Double", + "unit": "", + "value": 1178599424.0 + }, + { + "name": "smsp__sass_thread_inst_executed_op_misc_pred_on.sum", + "type": "Double", + "unit": "", + "value": 143654912.0 + }, + { + "name": "smsp__sass_thread_inst_executed_op_conversion_pred_on.sum", + "type": "Double", + "unit": "", + "value": 3145728.0 + }, + { + "name": "smsp__inst_executed.sum", + "type": "Double", + "unit": "", + "value": 266371072.0 + }, + { + "name": "smsp__pipe_fma_cycles_active.avg.pct_of_peak_sustained_active", + "type": "Double", + "unit": "", + "value": 9.14357545576909 + }, + { + "name": "smsp__inst_executed_pipe_fp64.avg.pct_of_peak_sustained_active", + "type": "Double", + "unit": "", + "value": 0.0 + }, + { + "name": "smsp__inst_executed_pipe_xu.avg.pct_of_peak_sustained_active", + "type": "Double", + "unit": "", + "value": 20.589134239792802 + }, + { + "name": "smsp__inst_executed_pipe_lsu.avg.pct_of_peak_sustained_active", + "type": "Double", + "unit": "", + "value": 5.665027902403928 + }, + { + "name": "smsp__inst_executed_pipe_tex.avg.pct_of_peak_sustained_active", + "type": "Double", + "unit": "", + "value": 0.0 + }, + { + "name": "smsp__issue_active.avg.pct_of_peak_sustained_active", + "type": "Double", + "unit": "", + "value": 20.431128290977032 + }, + { + "name": "smsp__thread_inst_executed_per_inst_executed.ratio", + "type": "Double", + "unit": "", + "value": 32.0 + }, + { + "name": "smsp__thread_inst_executed_per_inst_executed.pct", + "type": "Double", + "unit": "", + "value": 100.0 + } + ], + "objectives": [ + "time" + ], + "times": { + "compilation_time": 14672.188, + "data": 58052.427, + "framework": 550303.899, + "kernel_overhead": 195800.001, + "profiling_overhead": 48034.79, + "profiling_runs": 248416.681, + "runtimes": [ + 8292.8 + ], + "search_algorithm": 26.662, + "validation": 14.565 + }, + "timestamp": "2026-03-13 09:40:22 UTC" + }, + { + "compilation_data": { + "constant_memory_size": 0, + "global_size": { + "x": 8, + "y": 64, + "z": 8 + }, + "local_memory_size": 0, + "local_size": { + "x": 32, + "y": 4, + "z": 1 + }, + "max_work_group_size": 1024, + "private_memory_size": 128, + "registers": 32 + }, + "configuration": { + "INNER_UNROLL_FACTOR": 2, + "USE_CONSTANT_MEMORY": 0, + "USE_SOA": 0, + "VECTOR_SIZE": 1, + "WORK_GROUP_SIZE_X": 32, + "WORK_GROUP_SIZE_Y": 4, + "WORK_GROUP_SIZE_Z": 1, + "Z_ITERATIONS": 32 + }, + "correctness": 1, + "invalidity": "correct", + "measurements": [ + { + "name": "time", + "unit": "", + "value": 9450.464 + }, + { + "name": "dram__throughput.avg.pct_of_peak_sustained_elapsed", + "type": "Double", + "unit": "", + "value": 6.028753431499242 + }, + { + "name": "dram__sectors_read.sum", + "type": "Double", + "unit": "", + "value": 12060.0 + }, + { + "name": "dram__sectors_write.sum", + "type": "Double", + "unit": "", + "value": 2197400.0 + }, + { + "name": "lts__t_sectors.avg.pct_of_peak_sustained_elapsed", + "type": "Double", + "unit": "", + "value": 81.82638758017924 + }, + { + "name": "lts__t_sectors_op_read.sum", + "type": "Double", + "unit": "", + "value": 133008102.0 + }, + { + "name": "lts__t_sectors_op_write.sum", + "type": "Double", + "unit": "", + "value": 138417326.0 + }, + { + "name": "l1tex__t_requests_pipe_lsu_mem_global_op_ld.avg.pct_of_peak_sustained_active", + "type": "Double", + "unit": "", + "value": 0.27384396277109796 + }, + { + "name": "l1tex__t_requests_pipe_lsu_mem_global_op_ld.sum", + "type": "Double", + "unit": "", + "value": 1048576.0 + }, + { + "name": "l1tex__data_pipe_lsu_wavefronts_mem_shared.avg.pct_of_peak_sustained_elapsed", + "type": "Double", + "unit": "", + "value": 0.012808863199631511 + }, + { + "name": "l1tex__data_pipe_lsu_wavefronts_mem_shared_op_ld.sum", + "type": "Double", + "unit": "", + "value": 0.0 + }, + { + "name": "l1tex__data_pipe_lsu_wavefronts_mem_shared_op_st.sum", + "type": "Double", + "unit": "", + "value": 0.0 + }, + { + "name": "l1tex__t_sectors_pipe_lsu_mem_local_op_ld.sum", + "type": "Double", + "unit": "", + "value": 136314880.0 + }, + { + "name": "l1tex__t_sectors_pipe_lsu_mem_local_op_st.sum", + "type": "Double", + "unit": "", + "value": 136314880.0 + }, { "name": "sm__warps_active.avg.pct_of_peak_sustained_active", "type": "Double", "unit": "", - "value": 98.71261971373654 + "value": 98.74788394591457 }, { "name": "smsp__cycles_active.avg.pct_of_peak_sustained_elapsed", "type": "Double", "unit": "", - "value": 99.94750210315436 + "value": 99.3040329380209 }, { "name": "smsp__sass_thread_inst_executed_op_fp16_pred_on.sum", @@ -17838,7 +19164,7 @@ "name": "smsp__sass_thread_inst_executed_op_fp32_pred_on.sum", "type": "Double", "unit": "", - "value": 4982833152.0 + "value": 4462739456.0 }, { "name": "smsp__sass_thread_inst_executed_op_fp64_pred_on.sum", @@ -17850,43 +19176,43 @@ "name": "smsp__sass_thread_inst_executed_op_integer_pred_on.sum", "type": "Double", "unit": "", - "value": 2373976064.0 + "value": 1817706496.0 }, { "name": "smsp__sass_thread_inst_executed_op_control_pred_on.sum", "type": "Double", "unit": "", - "value": 268435456.0 + "value": 536870912.0 }, { "name": "smsp__sass_thread_inst_executed_op_memory_pred_on.sum", "type": "Double", "unit": "", - "value": 167772160.0 + "value": 1145044992.0 }, { "name": "smsp__sass_thread_inst_executed_op_misc_pred_on.sum", "type": "Double", "unit": "", - "value": 2042626048.0 + "value": 72351744.0 }, { "name": "smsp__sass_thread_inst_executed_op_conversion_pred_on.sum", "type": "Double", "unit": "", - "value": 6291456.0 + "value": 1572864.0 }, { "name": "smsp__inst_executed.sum", "type": "Double", "unit": "", - "value": 379191296.0 + "value": 252264448.0 }, { "name": "smsp__pipe_fma_cycles_active.avg.pct_of_peak_sustained_active", "type": "Double", "unit": "", - "value": 36.85614393964615 + "value": 7.264286335596472 }, { "name": "smsp__inst_executed_pipe_fp64.avg.pct_of_peak_sustained_active", @@ -17898,13 +19224,13 @@ "name": "smsp__inst_executed_pipe_xu.avg.pct_of_peak_sustained_active", "type": "Double", "unit": "", - "value": 59.57409495804985 + "value": 17.6109341898316 }, { "name": "smsp__inst_executed_pipe_lsu.avg.pct_of_peak_sustained_active", "type": "Double", "unit": "", - "value": 2.414379824960028 + "value": 4.701551888813685 }, { "name": "smsp__inst_executed_pipe_tex.avg.pct_of_peak_sustained_active", @@ -17916,7 +19242,7 @@ "name": "smsp__issue_active.avg.pct_of_peak_sustained_active", "type": "Double", "unit": "", - "value": 84.15496882561384 + "value": 16.550332954734046 }, { "name": "smsp__thread_inst_executed_per_inst_executed.ratio", @@ -17935,30 +19261,47 @@ "time" ], "times": { - "compilation": 15376.344, - "data": 66867.848, - "framework": 1195523.302, - "kernel_overhead": 523466.175, - "profiling_overhead": 55649.026, - "profiling_runs": 549540.253, + "compilation_time": 14740.728, + "data": 58912.704, + "framework": 534258.179, + "kernel_overhead": 183703.68, + "profiling_overhead": 48838.614, + "profiling_runs": 242803.181, "runtimes": [ - 2801.344 + 9450.464 ], - "search_algorithm": 25.482, - "validation": 18.375 + "search_algorithm": 23.36, + "validation": 15.664 }, - "timestamp": "2026-01-27 09:25:34 UTC" + "timestamp": "2026-03-13 09:40:22 UTC" }, { + "compilation_data": { + "constant_memory_size": 0, + "global_size": { + "x": 8, + "y": 64, + "z": 32 + }, + "local_memory_size": 0, + "local_size": { + "x": 32, + "y": 4, + "z": 1 + }, + "max_work_group_size": 1024, + "private_memory_size": 0, + "registers": 31 + }, "configuration": { - "INNER_UNROLL_FACTOR": "4", - "USE_CONSTANT_MEMORY": "0", - "USE_SOA": "0", - "VECTOR_SIZE": "1", - "WORK_GROUP_SIZE_X": "32", - "WORK_GROUP_SIZE_Y": "4", - "WORK_GROUP_SIZE_Z": "1", - "Z_ITERATIONS": "8" + "INNER_UNROLL_FACTOR": 4, + "USE_CONSTANT_MEMORY": 0, + "USE_SOA": 0, + "VECTOR_SIZE": 1, + "WORK_GROUP_SIZE_X": 32, + "WORK_GROUP_SIZE_Y": 4, + "WORK_GROUP_SIZE_Z": 1, + "Z_ITERATIONS": 8 }, "correctness": 1, "invalidity": "correct", @@ -17966,61 +19309,61 @@ { "name": "time", "unit": "", - "value": 2861.184 + "value": 2589.088 }, { "name": "dram__throughput.avg.pct_of_peak_sustained_elapsed", "type": "Double", "unit": "", - "value": 36.43379093273437 + "value": 18.566369949170817 }, { "name": "dram__sectors_read.sum", "type": "Double", "unit": "", - "value": 2105208.0 + "value": 10820.0 }, { "name": "dram__sectors_write.sum", "type": "Double", "unit": "", - "value": 1839668.0 + "value": 1840660.0 }, { "name": "lts__t_sectors.avg.pct_of_peak_sustained_elapsed", "type": "Double", "unit": "", - "value": 4.3401955652016495 + "value": 2.415925107142998 }, { "name": "lts__t_sectors_op_read.sum", "type": "Double", "unit": "", - "value": 2153262.0 + "value": 56235.0 }, { "name": "lts__t_sectors_op_write.sum", "type": "Double", "unit": "", - "value": 2100642.0 + "value": 2102559.0 }, { "name": "l1tex__t_requests_pipe_lsu_mem_global_op_ld.avg.pct_of_peak_sustained_active", "type": "Double", "unit": "", - "value": 4.188551796662393 + "value": 4.06699430780362 }, { "name": "l1tex__t_requests_pipe_lsu_mem_global_op_ld.sum", "type": "Double", "unit": "", - "value": 4718592.0 + "value": 4194304.0 }, { "name": "l1tex__data_pipe_lsu_wavefronts_mem_shared.avg.pct_of_peak_sustained_elapsed", "type": "Double", "unit": "", - "value": 0.17444568103200633 + "value": 0.19055415545841586 }, { "name": "l1tex__data_pipe_lsu_wavefronts_mem_shared_op_ld.sum", @@ -18050,13 +19393,13 @@ "name": "sm__warps_active.avg.pct_of_peak_sustained_active", "type": "Double", "unit": "", - "value": 98.62704268881774 + "value": 98.57617011699715 }, { "name": "smsp__cycles_active.avg.pct_of_peak_sustained_elapsed", "type": "Double", "unit": "", - "value": 99.9501749815961 + "value": 99.94819289514302 }, { "name": "smsp__sass_thread_inst_executed_op_fp16_pred_on.sum", @@ -18068,7 +19411,7 @@ "name": "smsp__sass_thread_inst_executed_op_fp32_pred_on.sum", "type": "Double", "unit": "", - "value": 4982833152.0 + "value": 4966055936.0 }, { "name": "smsp__sass_thread_inst_executed_op_fp64_pred_on.sum", @@ -18080,7 +19423,7 @@ "name": "smsp__sass_thread_inst_executed_op_integer_pred_on.sum", "type": "Double", "unit": "", - "value": 2373976064.0 + "value": 2776629248.0 }, { "name": "smsp__sass_thread_inst_executed_op_control_pred_on.sum", @@ -18092,13 +19435,13 @@ "name": "smsp__sass_thread_inst_executed_op_memory_pred_on.sum", "type": "Double", "unit": "", - "value": 167772160.0 + "value": 150994944.0 }, { "name": "smsp__sass_thread_inst_executed_op_misc_pred_on.sum", "type": "Double", "unit": "", - "value": 2042626048.0 + "value": 1642070016.0 }, { "name": "smsp__sass_thread_inst_executed_op_conversion_pred_on.sum", @@ -18110,13 +19453,13 @@ "name": "smsp__inst_executed.sum", "type": "Double", "unit": "", - "value": 379191296.0 + "value": 378208256.0 }, { "name": "smsp__pipe_fma_cycles_active.avg.pct_of_peak_sustained_active", "type": "Double", "unit": "", - "value": 36.855066987314295 + "value": 45.24841739539231 }, { "name": "smsp__inst_executed_pipe_fp64.avg.pct_of_peak_sustained_active", @@ -18128,13 +19471,13 @@ "name": "smsp__inst_executed_pipe_xu.avg.pct_of_peak_sustained_active", "type": "Double", "unit": "", - "value": 59.57380845328392 + "value": 65.07619915787394 }, { "name": "smsp__inst_executed_pipe_lsu.avg.pct_of_peak_sustained_active", "type": "Double", "unit": "", - "value": 2.414368213682893 + "value": 2.383161590254172 }, { "name": "smsp__inst_executed_pipe_tex.avg.pct_of_peak_sustained_active", @@ -18146,7 +19489,7 @@ "name": "smsp__issue_active.avg.pct_of_peak_sustained_active", "type": "Double", "unit": "", - "value": 84.15460383150672 + "value": 91.68902941293263 }, { "name": "smsp__thread_inst_executed_per_inst_executed.ratio", @@ -18165,30 +19508,47 @@ "time" ], "times": { - "compilation": 14219.159, - "data": 63365.402, - "framework": 1194193.2280000001, - "kernel_overhead": 526143.782, - "profiling_overhead": 53078.85, - "profiling_runs": 551605.194, + "compilation_time": 16245.419, + "data": 57776.847, + "framework": 1083578.5869999998, + "kernel_overhead": 476823.378, + "profiling_overhead": 47514.37, + "profiling_runs": 501463.992, "runtimes": [ - 2861.184 + 2589.088 ], - "search_algorithm": 22.927, - "validation": 12.258 + "search_algorithm": 25.207, + "validation": 16.091 }, - "timestamp": "2026-01-27 09:25:35 UTC" + "timestamp": "2026-03-13 09:40:23 UTC" }, { + "compilation_data": { + "constant_memory_size": 0, + "global_size": { + "x": 8, + "y": 64, + "z": 16 + }, + "local_memory_size": 0, + "local_size": { + "x": 32, + "y": 4, + "z": 1 + }, + "max_work_group_size": 1024, + "private_memory_size": 0, + "registers": 38 + }, "configuration": { - "INNER_UNROLL_FACTOR": "4", - "USE_CONSTANT_MEMORY": "0", - "USE_SOA": "0", - "VECTOR_SIZE": "1", - "WORK_GROUP_SIZE_X": "32", - "WORK_GROUP_SIZE_Y": "8", - "WORK_GROUP_SIZE_Z": "1", - "Z_ITERATIONS": "8" + "INNER_UNROLL_FACTOR": 4, + "USE_CONSTANT_MEMORY": 0, + "USE_SOA": 0, + "VECTOR_SIZE": 1, + "WORK_GROUP_SIZE_X": 32, + "WORK_GROUP_SIZE_Y": 4, + "WORK_GROUP_SIZE_Z": 1, + "Z_ITERATIONS": 16 }, "correctness": 1, "invalidity": "correct", @@ -18196,61 +19556,61 @@ { "name": "time", "unit": "", - "value": 2808.704 + "value": 3303.712 }, { "name": "dram__throughput.avg.pct_of_peak_sustained_elapsed", "type": "Double", "unit": "", - "value": 36.28551922216729 + "value": 14.425315015750787 }, { "name": "dram__sectors_read.sum", "type": "Double", "unit": "", - "value": 2104096.0 + "value": 7096.0 }, { "name": "dram__sectors_write.sum", "type": "Double", "unit": "", - "value": 1841632.0 + "value": 1839252.0 }, { "name": "lts__t_sectors.avg.pct_of_peak_sustained_elapsed", "type": "Double", "unit": "", - "value": 4.320918153739336 + "value": 1.908622369561957 }, { "name": "lts__t_sectors_op_read.sum", "type": "Double", "unit": "", - "value": 2151975.0 + "value": 64433.0 }, { "name": "lts__t_sectors_op_write.sum", "type": "Double", "unit": "", - "value": 2101467.0 + "value": 2101960.0 }, { "name": "l1tex__t_requests_pipe_lsu_mem_global_op_ld.avg.pct_of_peak_sustained_active", "type": "Double", "unit": "", - "value": 4.194773628211976 + "value": 1.5934855191918345 }, { "name": "l1tex__t_requests_pipe_lsu_mem_global_op_ld.sum", "type": "Double", "unit": "", - "value": 4718592.0 + "value": 2097152.0 }, { "name": "l1tex__data_pipe_lsu_wavefronts_mem_shared.avg.pct_of_peak_sustained_elapsed", "type": "Double", "unit": "", - "value": 0.1747326592446524 + "value": 0.07466932027948979 }, { "name": "l1tex__data_pipe_lsu_wavefronts_mem_shared_op_ld.sum", @@ -18280,13 +19640,13 @@ "name": "sm__warps_active.avg.pct_of_peak_sustained_active", "type": "Double", "unit": "", - "value": 98.10698799271498 + "value": 98.58872206402665 }, { "name": "smsp__cycles_active.avg.pct_of_peak_sustained_elapsed", "type": "Double", "unit": "", - "value": 99.96446832419534 + "value": 99.96150755674424 }, { "name": "smsp__sass_thread_inst_executed_op_fp16_pred_on.sum", @@ -18298,7 +19658,7 @@ "name": "smsp__sass_thread_inst_executed_op_fp32_pred_on.sum", "type": "Double", "unit": "", - "value": 4982833152.0 + "value": 4630511616.0 }, { "name": "smsp__sass_thread_inst_executed_op_fp64_pred_on.sum", @@ -18310,7 +19670,7 @@ "name": "smsp__sass_thread_inst_executed_op_integer_pred_on.sum", "type": "Double", "unit": "", - "value": 2373976064.0 + "value": 3234856960.0 }, { "name": "smsp__sass_thread_inst_executed_op_control_pred_on.sum", @@ -18322,31 +19682,31 @@ "name": "smsp__sass_thread_inst_executed_op_memory_pred_on.sum", "type": "Double", "unit": "", - "value": 167772160.0 + "value": 83886080.0 }, { "name": "smsp__sass_thread_inst_executed_op_misc_pred_on.sum", "type": "Double", "unit": "", - "value": 2042626048.0 + "value": 1294991360.0 }, { "name": "smsp__sass_thread_inst_executed_op_conversion_pred_on.sum", "type": "Double", "unit": "", - "value": 6291456.0 + "value": 3145728.0 }, { "name": "smsp__inst_executed.sum", "type": "Double", "unit": "", - "value": 379191296.0 + "value": 500957184.0 }, { "name": "smsp__pipe_fma_cycles_active.avg.pct_of_peak_sustained_active", "type": "Double", "unit": "", - "value": 36.91103495095188 + "value": 45.37353463878001 }, { "name": "smsp__inst_executed_pipe_fp64.avg.pct_of_peak_sustained_active", @@ -18358,13 +19718,13 @@ "name": "smsp__inst_executed_pipe_xu.avg.pct_of_peak_sustained_active", "type": "Double", "unit": "", - "value": 59.663280385535685 + "value": 50.99388476962389 }, { "name": "smsp__inst_executed_pipe_lsu.avg.pct_of_peak_sustained_active", "type": "Double", "unit": "", - "value": 2.4179942734372375 + "value": 1.0333233486032185 }, { "name": "smsp__inst_executed_pipe_tex.avg.pct_of_peak_sustained_active", @@ -18376,7 +19736,7 @@ "name": "smsp__issue_active.avg.pct_of_peak_sustained_active", "type": "Double", "unit": "", - "value": 84.28100885015861 + "value": 95.16611334939863 }, { "name": "smsp__thread_inst_executed_per_inst_executed.ratio", @@ -18395,30 +19755,47 @@ "time" ], "times": { - "compilation": 14268.95, - "data": 61963.826, - "framework": 1196010.19, - "kernel_overhead": 528228.426, - "profiling_overhead": 51792.96, - "profiling_runs": 554024.978, + "compilation_time": 14384.591, + "data": 57535.706, + "framework": 1835294.1860000002, + "kernel_overhead": 851160.555, + "profiling_overhead": 47265.868, + "profiling_runs": 879332.057, "runtimes": [ - 2808.704 + 3303.712 ], - "search_algorithm": 31.324, - "validation": 13.525 + "search_algorithm": 24.752, + "validation": 14.596 }, - "timestamp": "2026-01-27 09:25:36 UTC" + "timestamp": "2026-03-13 09:40:24 UTC" }, { + "compilation_data": { + "constant_memory_size": 0, + "global_size": { + "x": 8, + "y": 64, + "z": 8 + }, + "local_memory_size": 0, + "local_size": { + "x": 32, + "y": 4, + "z": 1 + }, + "max_work_group_size": 1024, + "private_memory_size": 0, + "registers": 56 + }, "configuration": { - "INNER_UNROLL_FACTOR": "4", - "USE_CONSTANT_MEMORY": "0", - "USE_SOA": "0", - "VECTOR_SIZE": "1", - "WORK_GROUP_SIZE_X": "16", - "WORK_GROUP_SIZE_Y": "4", - "WORK_GROUP_SIZE_Z": "1", - "Z_ITERATIONS": "16" + "INNER_UNROLL_FACTOR": 4, + "USE_CONSTANT_MEMORY": 0, + "USE_SOA": 0, + "VECTOR_SIZE": 1, + "WORK_GROUP_SIZE_X": 32, + "WORK_GROUP_SIZE_Y": 4, + "WORK_GROUP_SIZE_Z": 1, + "Z_ITERATIONS": 32 }, "correctness": 1, "invalidity": "correct", @@ -18426,61 +19803,61 @@ { "name": "time", "unit": "", - "value": 3666.016 + "value": 5577.856 }, { "name": "dram__throughput.avg.pct_of_peak_sustained_elapsed", "type": "Double", "unit": "", - "value": 28.265057139211898 + "value": 8.782946639617906 }, { "name": "dram__sectors_read.sum", "type": "Double", "unit": "", - "value": 2103612.0 + "value": 9884.0 }, { "name": "dram__sectors_write.sum", "type": "Double", "unit": "", - "value": 1866412.0 + "value": 1835496.0 }, { "name": "lts__t_sectors.avg.pct_of_peak_sustained_elapsed", "type": "Double", "unit": "", - "value": 3.322329525460024 + "value": 1.156534322130982 }, { "name": "lts__t_sectors_op_read.sum", "type": "Double", "unit": "", - "value": 2153005.0 + "value": 98129.0 }, { "name": "lts__t_sectors_op_write.sum", "type": "Double", "unit": "", - "value": 2100462.0 + "value": 2100808.0 }, { "name": "l1tex__t_requests_pipe_lsu_mem_global_op_ld.avg.pct_of_peak_sustained_active", "type": "Double", "unit": "", - "value": 1.7621289763948698 + "value": 0.47671759322173357 }, { "name": "l1tex__t_requests_pipe_lsu_mem_global_op_ld.sum", "type": "Double", "unit": "", - "value": 2621440.0 + "value": 1048576.0 }, { "name": "l1tex__data_pipe_lsu_wavefronts_mem_shared.avg.pct_of_peak_sustained_elapsed", "type": "Double", "unit": "", - "value": 0.06606553907931477 + "value": 0.02234203754000494 }, { "name": "l1tex__data_pipe_lsu_wavefronts_mem_shared_op_ld.sum", @@ -18510,13 +19887,13 @@ "name": "sm__warps_active.avg.pct_of_peak_sustained_active", "type": "Double", "unit": "", - "value": 98.6743710268328 + "value": 73.86752547447288 }, { "name": "smsp__cycles_active.avg.pct_of_peak_sustained_elapsed", "type": "Double", "unit": "", - "value": 99.96510093812205 + "value": 99.97930025131156 }, { "name": "smsp__sass_thread_inst_executed_op_fp16_pred_on.sum", @@ -18528,7 +19905,7 @@ "name": "smsp__sass_thread_inst_executed_op_fp32_pred_on.sum", "type": "Double", "unit": "", - "value": 4647288832.0 + "value": 4462739456.0 }, { "name": "smsp__sass_thread_inst_executed_op_fp64_pred_on.sum", @@ -18540,7 +19917,7 @@ "name": "smsp__sass_thread_inst_executed_op_integer_pred_on.sum", "type": "Double", "unit": "", - "value": 2695888896.0 + "value": 4706533376.0 }, { "name": "smsp__sass_thread_inst_executed_op_control_pred_on.sum", @@ -18552,31 +19929,31 @@ "name": "smsp__sass_thread_inst_executed_op_memory_pred_on.sum", "type": "Double", "unit": "", - "value": 100663296.0 + "value": 50331648.0 }, { "name": "smsp__sass_thread_inst_executed_op_misc_pred_on.sum", "type": "Double", "unit": "", - "value": 1763704832.0 + "value": 1121452032.0 }, { "name": "smsp__sass_thread_inst_executed_op_conversion_pred_on.sum", "type": "Double", "unit": "", - "value": 3145728.0 + "value": 1572864.0 }, { "name": "smsp__inst_executed.sum", "type": "Double", "unit": "", - "value": 499810304.0 + "value": 852869120.0 }, { "name": "smsp__pipe_fma_cycles_active.avg.pct_of_peak_sustained_active", "type": "Double", "unit": "", - "value": 31.045159254851267 + "value": 42.98940273418644 }, { "name": "smsp__inst_executed_pipe_fp64.avg.pct_of_peak_sustained_active", @@ -18588,13 +19965,13 @@ "name": "smsp__inst_executed_pipe_xu.avg.pct_of_peak_sustained_active", "type": "Double", "unit": "", - "value": 45.11648657537932 + "value": 30.510644214629064 }, { "name": "smsp__inst_executed_pipe_lsu.avg.pct_of_peak_sustained_active", "type": "Double", "unit": "", - "value": 1.0904619558014044 + "value": 0.3687199435117526 }, { "name": "smsp__inst_executed_pipe_tex.avg.pct_of_peak_sustained_active", @@ -18606,7 +19983,7 @@ "name": "smsp__issue_active.avg.pct_of_peak_sustained_active", "type": "Double", "unit": "", - "value": 84.00472599667323 + "value": 96.93844030301825 }, { "name": "smsp__thread_inst_executed_per_inst_executed.ratio", @@ -18625,30 +20002,47 @@ "time" ], "times": { - "compilation": 18493.806, - "data": 61099.101, - "framework": 1899579.83, - "kernel_overhead": 878855.474, - "profiling_overhead": 50990.259, - "profiling_runs": 908634.996, + "compilation_time": 14983.356, + "data": 59170.929, + "framework": 3073695.7309999997, + "kernel_overhead": 1463222.439, + "profiling_overhead": 49105.928, + "profiling_runs": 1502196.435, "runtimes": [ - 3666.016 + 5577.856 ], - "search_algorithm": 27.621, - "validation": 14.677 + "search_algorithm": 37.82, + "validation": 18.067 }, - "timestamp": "2026-01-27 09:25:37 UTC" + "timestamp": "2026-03-13 09:40:25 UTC" }, { + "compilation_data": { + "constant_memory_size": 0, + "global_size": { + "x": 8, + "y": 64, + "z": 16 + }, + "local_memory_size": 0, + "local_size": { + "x": 32, + "y": 4, + "z": 1 + }, + "max_work_group_size": 1024, + "private_memory_size": 0, + "registers": 38 + }, "configuration": { - "INNER_UNROLL_FACTOR": "4", - "USE_CONSTANT_MEMORY": "0", - "USE_SOA": "0", - "VECTOR_SIZE": "1", - "WORK_GROUP_SIZE_X": "16", - "WORK_GROUP_SIZE_Y": "8", - "WORK_GROUP_SIZE_Z": "1", - "Z_ITERATIONS": "16" + "INNER_UNROLL_FACTOR": 8, + "USE_CONSTANT_MEMORY": 0, + "USE_SOA": 0, + "VECTOR_SIZE": 1, + "WORK_GROUP_SIZE_X": 32, + "WORK_GROUP_SIZE_Y": 4, + "WORK_GROUP_SIZE_Z": 1, + "Z_ITERATIONS": 16 }, "correctness": 1, "invalidity": "correct", @@ -18656,61 +20050,61 @@ { "name": "time", "unit": "", - "value": 3671.104 + "value": 3281.44 }, { "name": "dram__throughput.avg.pct_of_peak_sustained_elapsed", "type": "Double", "unit": "", - "value": 28.158494074691504 + "value": 14.768392388047449 }, { "name": "dram__sectors_read.sum", "type": "Double", "unit": "", - "value": 2106300.0 + "value": 5056.0 }, { "name": "dram__sectors_write.sum", "type": "Double", "unit": "", - "value": 1871896.0 + "value": 1837132.0 }, { "name": "lts__t_sectors.avg.pct_of_peak_sustained_elapsed", "type": "Double", "unit": "", - "value": 3.3227098294226027 + "value": 1.8995463187179429 }, { "name": "lts__t_sectors_op_read.sum", "type": "Double", "unit": "", - "value": 2155322.0 + "value": 58984.0 }, { "name": "lts__t_sectors_op_write.sum", "type": "Double", "unit": "", - "value": 2108226.0 + "value": 2100639.0 }, { "name": "l1tex__t_requests_pipe_lsu_mem_global_op_ld.avg.pct_of_peak_sustained_active", "type": "Double", "unit": "", - "value": 1.7620899005705497 + "value": 1.587539717221002 }, { "name": "l1tex__t_requests_pipe_lsu_mem_global_op_ld.sum", "type": "Double", "unit": "", - "value": 2621440.0 + "value": 2097152.0 }, { "name": "l1tex__data_pipe_lsu_wavefronts_mem_shared.avg.pct_of_peak_sustained_elapsed", "type": "Double", "unit": "", - "value": 0.06606345414124794 + "value": 0.07439086576062935 }, { "name": "l1tex__data_pipe_lsu_wavefronts_mem_shared_op_ld.sum", @@ -18740,13 +20134,13 @@ "name": "sm__warps_active.avg.pct_of_peak_sustained_active", "type": "Double", "unit": "", - "value": 98.61195866505611 + "value": 98.68425923730632 }, { "name": "smsp__cycles_active.avg.pct_of_peak_sustained_elapsed", "type": "Double", "unit": "", - "value": 99.96876579107679 + "value": 99.96188077307899 }, { "name": "smsp__sass_thread_inst_executed_op_fp16_pred_on.sum", @@ -18758,7 +20152,7 @@ "name": "smsp__sass_thread_inst_executed_op_fp32_pred_on.sum", "type": "Double", "unit": "", - "value": 4647288832.0 + "value": 4630511616.0 }, { "name": "smsp__sass_thread_inst_executed_op_fp64_pred_on.sum", @@ -18770,25 +20164,25 @@ "name": "smsp__sass_thread_inst_executed_op_integer_pred_on.sum", "type": "Double", "unit": "", - "value": 2695888896.0 + "value": 2295332864.0 }, { "name": "smsp__sass_thread_inst_executed_op_control_pred_on.sum", "type": "Double", "unit": "", - "value": 268435456.0 + "value": 134217728.0 }, { "name": "smsp__sass_thread_inst_executed_op_memory_pred_on.sum", "type": "Double", "unit": "", - "value": 100663296.0 + "value": 83886080.0 }, { "name": "smsp__sass_thread_inst_executed_op_misc_pred_on.sum", "type": "Double", "unit": "", - "value": 1763704832.0 + "value": 1429209088.0 }, { "name": "smsp__sass_thread_inst_executed_op_conversion_pred_on.sum", @@ -18800,13 +20194,13 @@ "name": "smsp__inst_executed.sum", "type": "Double", "unit": "", - "value": 499810304.0 + "value": 471597056.0 }, { "name": "smsp__pipe_fma_cycles_active.avg.pct_of_peak_sustained_active", "type": "Double", "unit": "", - "value": 31.043106304450323 + "value": 42.028416342916906 }, { "name": "smsp__inst_executed_pipe_fp64.avg.pct_of_peak_sustained_active", @@ -18818,13 +20212,13 @@ "name": "smsp__inst_executed_pipe_xu.avg.pct_of_peak_sustained_active", "type": "Double", "unit": "", - "value": 45.11340884346248 + "value": 50.80353027224467 }, { "name": "smsp__inst_executed_pipe_lsu.avg.pct_of_peak_sustained_active", "type": "Double", "unit": "", - "value": 1.090387567261422 + "value": 1.0294660675283955 }, { "name": "smsp__inst_executed_pipe_tex.avg.pct_of_peak_sustained_active", @@ -18836,7 +20230,7 @@ "name": "smsp__issue_active.avg.pct_of_peak_sustained_active", "type": "Double", "unit": "", - "value": 83.99902733951626 + "value": 89.25423271457413 }, { "name": "smsp__thread_inst_executed_per_inst_executed.ratio", @@ -18855,30 +20249,47 @@ "time" ], "times": { - "compilation": 18846.761, - "data": 60567.184, - "framework": 1899126.4789999998, - "kernel_overhead": 878996.69, - "profiling_overhead": 50530.71, - "profiling_runs": 909031.895, + "compilation_time": 15309.066, + "data": 57230.757, + "framework": 1662959.583, + "kernel_overhead": 765418.246, + "profiling_overhead": 46997.44, + "profiling_runs": 793313.14, "runtimes": [ - 3671.104 + 3281.44 ], - "search_algorithm": 24.055, - "validation": 15.039 + "search_algorithm": 25.39, + "validation": 15.594 }, - "timestamp": "2026-01-27 09:25:38 UTC" + "timestamp": "2026-03-13 09:40:26 UTC" }, { + "compilation_data": { + "constant_memory_size": 0, + "global_size": { + "x": 8, + "y": 64, + "z": 8 + }, + "local_memory_size": 0, + "local_size": { + "x": 32, + "y": 4, + "z": 1 + }, + "max_work_group_size": 1024, + "private_memory_size": 0, + "registers": 48 + }, "configuration": { - "INNER_UNROLL_FACTOR": "4", - "USE_CONSTANT_MEMORY": "0", - "USE_SOA": "0", - "VECTOR_SIZE": "1", - "WORK_GROUP_SIZE_X": "32", - "WORK_GROUP_SIZE_Y": "2", - "WORK_GROUP_SIZE_Z": "1", - "Z_ITERATIONS": "16" + "INNER_UNROLL_FACTOR": 8, + "USE_CONSTANT_MEMORY": 0, + "USE_SOA": 0, + "VECTOR_SIZE": 1, + "WORK_GROUP_SIZE_X": 32, + "WORK_GROUP_SIZE_Y": 4, + "WORK_GROUP_SIZE_Z": 1, + "Z_ITERATIONS": 32 }, "correctness": 1, "invalidity": "correct", @@ -18886,61 +20297,61 @@ { "name": "time", "unit": "", - "value": 3717.056 + "value": 6350.656 }, { "name": "dram__throughput.avg.pct_of_peak_sustained_elapsed", "type": "Double", "unit": "", - "value": 27.86199681915395 + "value": 7.770033259722073 }, { "name": "dram__sectors_read.sum", "type": "Double", "unit": "", - "value": 2097616.0 + "value": 8304.0 }, { "name": "dram__sectors_write.sum", "type": "Double", "unit": "", - "value": 1837764.0 + "value": 1838204.0 }, { "name": "lts__t_sectors.avg.pct_of_peak_sustained_elapsed", "type": "Double", "unit": "", - "value": 3.308022406421817 + "value": 1.0284505501879089 }, { "name": "lts__t_sectors_op_read.sum", "type": "Double", "unit": "", - "value": 2159606.0 + "value": 112179.0 }, { "name": "lts__t_sectors_op_write.sum", "type": "Double", "unit": "", - "value": 2099295.0 + "value": 2108940.0 }, { "name": "l1tex__t_requests_pipe_lsu_mem_global_op_ld.avg.pct_of_peak_sustained_active", "type": "Double", "unit": "", - "value": 1.7625615794548404 + "value": 0.4193071874215905 }, { "name": "l1tex__t_requests_pipe_lsu_mem_global_op_ld.sum", "type": "Double", "unit": "", - "value": 2621440.0 + "value": 1048576.0 }, { "name": "l1tex__data_pipe_lsu_wavefronts_mem_shared.avg.pct_of_peak_sustained_elapsed", "type": "Double", "unit": "", - "value": 0.0660808658142192 + "value": 0.019652238541583836 }, { "name": "l1tex__data_pipe_lsu_wavefronts_mem_shared_op_ld.sum", @@ -18970,13 +20381,13 @@ "name": "sm__warps_active.avg.pct_of_peak_sustained_active", "type": "Double", "unit": "", - "value": 98.69783736274734 + "value": 81.97197462420063 }, { "name": "smsp__cycles_active.avg.pct_of_peak_sustained_elapsed", "type": "Double", "unit": "", - "value": 99.96755564871481 + "value": 99.97820289586483 }, { "name": "smsp__sass_thread_inst_executed_op_fp16_pred_on.sum", @@ -18988,7 +20399,7 @@ "name": "smsp__sass_thread_inst_executed_op_fp32_pred_on.sum", "type": "Double", "unit": "", - "value": 4647288832.0 + "value": 4462739456.0 }, { "name": "smsp__sass_thread_inst_executed_op_fp64_pred_on.sum", @@ -19000,43 +20411,43 @@ "name": "smsp__sass_thread_inst_executed_op_integer_pred_on.sum", "type": "Double", "unit": "", - "value": 2695888896.0 + "value": 6215958528.0 }, { "name": "smsp__sass_thread_inst_executed_op_control_pred_on.sum", "type": "Double", "unit": "", - "value": 268435456.0 + "value": 134217728.0 }, { "name": "smsp__sass_thread_inst_executed_op_memory_pred_on.sum", "type": "Double", "unit": "", - "value": 100663296.0 + "value": 50331648.0 }, { "name": "smsp__sass_thread_inst_executed_op_misc_pred_on.sum", "type": "Double", "unit": "", - "value": 1763704832.0 + "value": 1355284480.0 }, { "name": "smsp__sass_thread_inst_executed_op_conversion_pred_on.sum", "type": "Double", "unit": "", - "value": 3145728.0 + "value": 1572864.0 }, { "name": "smsp__inst_executed.sum", "type": "Double", "unit": "", - "value": 499810304.0 + "value": 936706048.0 }, { "name": "smsp__pipe_fma_cycles_active.avg.pct_of_peak_sustained_active", "type": "Double", "unit": "", - "value": 31.05162608307581 + "value": 42.84488642392211 }, { "name": "smsp__inst_executed_pipe_fp64.avg.pct_of_peak_sustained_active", @@ -19048,13 +20459,13 @@ "name": "smsp__inst_executed_pipe_xu.avg.pct_of_peak_sustained_active", "type": "Double", "unit": "", - "value": 45.125845183572075 + "value": 26.837706198209982 }, { "name": "smsp__inst_executed_pipe_lsu.avg.pct_of_peak_sustained_active", "type": "Double", "unit": "", - "value": 1.090688152630282 + "value": 0.3243326310574693 }, { "name": "smsp__inst_executed_pipe_tex.avg.pct_of_peak_sustained_active", @@ -19066,7 +20477,7 @@ "name": "smsp__issue_active.avg.pct_of_peak_sustained_active", "type": "Double", "unit": "", - "value": 84.02215245133232 + "value": 93.65070669193656 }, { "name": "smsp__thread_inst_executed_per_inst_executed.ratio", @@ -19085,30 +20496,47 @@ "time" ], "times": { - "compilation": 14712.001, - "data": 61656.17, - "framework": 1899959.756, - "kernel_overhead": 878540.952, - "profiling_overhead": 51360.132, - "profiling_runs": 908402.502, + "compilation_time": 14909.53, + "data": 57943.266, + "framework": 3557017.2530000005, + "kernel_overhead": 1704352.269, + "profiling_overhead": 47902.81, + "profiling_runs": 1746818.908, "runtimes": [ - 3717.056 + 6350.656 ], - "search_algorithm": 27.327, - "validation": 13.571 + "search_algorithm": 24.478, + "validation": 14.822 }, - "timestamp": "2026-01-27 09:25:39 UTC" + "timestamp": "2026-03-13 09:40:28 UTC" }, { + "compilation_data": { + "constant_memory_size": 0, + "global_size": { + "x": 8, + "y": 64, + "z": 8 + }, + "local_memory_size": 0, + "local_size": { + "x": 32, + "y": 4, + "z": 1 + }, + "max_work_group_size": 1024, + "private_memory_size": 0, + "registers": 48 + }, "configuration": { - "INNER_UNROLL_FACTOR": "4", - "USE_CONSTANT_MEMORY": "0", - "USE_SOA": "0", - "VECTOR_SIZE": "1", - "WORK_GROUP_SIZE_X": "32", - "WORK_GROUP_SIZE_Y": "4", - "WORK_GROUP_SIZE_Z": "1", - "Z_ITERATIONS": "16" + "INNER_UNROLL_FACTOR": 16, + "USE_CONSTANT_MEMORY": 0, + "USE_SOA": 0, + "VECTOR_SIZE": 1, + "WORK_GROUP_SIZE_X": 32, + "WORK_GROUP_SIZE_Y": 4, + "WORK_GROUP_SIZE_Z": 1, + "Z_ITERATIONS": 32 }, "correctness": 1, "invalidity": "correct", @@ -19116,61 +20544,61 @@ { "name": "time", "unit": "", - "value": 3711.424 + "value": 5890.464 }, { "name": "dram__throughput.avg.pct_of_peak_sustained_elapsed", "type": "Double", "unit": "", - "value": 27.927439599079264 + "value": 8.17588452067444 }, { "name": "dram__sectors_read.sum", "type": "Double", "unit": "", - "value": 2104380.0 + "value": 784.0 }, { "name": "dram__sectors_write.sum", "type": "Double", "unit": "", - "value": 1840172.0 + "value": 1835156.0 }, { "name": "lts__t_sectors.avg.pct_of_peak_sustained_elapsed", "type": "Double", "unit": "", - "value": 3.3140920042058473 + "value": 1.0730006440710291 }, { "name": "lts__t_sectors_op_read.sum", "type": "Double", "unit": "", - "value": 2165102.0 + "value": 96089.0 }, { "name": "lts__t_sectors_op_write.sum", "type": "Double", "unit": "", - "value": 2101521.0 + "value": 2099897.0 }, { "name": "l1tex__t_requests_pipe_lsu_mem_global_op_ld.avg.pct_of_peak_sustained_active", "type": "Double", "unit": "", - "value": 1.762591124062999 + "value": 0.4408700479849637 }, { "name": "l1tex__t_requests_pipe_lsu_mem_global_op_ld.sum", "type": "Double", "unit": "", - "value": 2621440.0 + "value": 1048576.0 }, { "name": "l1tex__data_pipe_lsu_wavefronts_mem_shared.avg.pct_of_peak_sustained_elapsed", "type": "Double", "unit": "", - "value": 0.06608261423854468 + "value": 0.020661458571186158 }, { "name": "l1tex__data_pipe_lsu_wavefronts_mem_shared_op_ld.sum", @@ -19200,13 +20628,13 @@ "name": "sm__warps_active.avg.pct_of_peak_sustained_active", "type": "Double", "unit": "", - "value": 98.6629372650336 + "value": 81.97614854706899 }, { "name": "smsp__cycles_active.avg.pct_of_peak_sustained_elapsed", "type": "Double", "unit": "", - "value": 99.97183939784368 + "value": 99.97239316892632 }, { "name": "smsp__sass_thread_inst_executed_op_fp16_pred_on.sum", @@ -19218,7 +20646,7 @@ "name": "smsp__sass_thread_inst_executed_op_fp32_pred_on.sum", "type": "Double", "unit": "", - "value": 4647288832.0 + "value": 4462739456.0 }, { "name": "smsp__sass_thread_inst_executed_op_fp64_pred_on.sum", @@ -19230,43 +20658,43 @@ "name": "smsp__sass_thread_inst_executed_op_integer_pred_on.sum", "type": "Double", "unit": "", - "value": 2695888896.0 + "value": 5813305344.0 }, { "name": "smsp__sass_thread_inst_executed_op_control_pred_on.sum", "type": "Double", "unit": "", - "value": 268435456.0 + "value": 101187584.0 }, { "name": "smsp__sass_thread_inst_executed_op_memory_pred_on.sum", "type": "Double", "unit": "", - "value": 100663296.0 + "value": 50331648.0 }, { "name": "smsp__sass_thread_inst_executed_op_misc_pred_on.sum", "type": "Double", "unit": "", - "value": 1763704832.0 + "value": 986185728.0 }, { "name": "smsp__sass_thread_inst_executed_op_conversion_pred_on.sum", "type": "Double", "unit": "", - "value": 3145728.0 + "value": 1572864.0 }, { "name": "smsp__inst_executed.sum", "type": "Double", "unit": "", - "value": 499810304.0 + "value": 897892352.0 }, { "name": "smsp__pipe_fma_cycles_active.avg.pct_of_peak_sustained_active", "type": "Double", "unit": "", - "value": 31.050960248879893 + "value": 43.945559484850335 }, { "name": "smsp__inst_executed_pipe_fp64.avg.pct_of_peak_sustained_active", @@ -19278,13 +20706,13 @@ "name": "smsp__inst_executed_pipe_xu.avg.pct_of_peak_sustained_active", "type": "Double", "unit": "", - "value": 45.12510548827567 + "value": 28.217568078878802 }, { "name": "smsp__inst_executed_pipe_lsu.avg.pct_of_peak_sustained_active", "type": "Double", "unit": "", - "value": 1.0906702742527568 + "value": 0.3410082079844972 }, { "name": "smsp__inst_executed_pipe_tex.avg.pct_of_peak_sustained_active", @@ -19296,7 +20724,7 @@ "name": "smsp__issue_active.avg.pct_of_peak_sustained_active", "type": "Double", "unit": "", - "value": 84.02080593733523 + "value": 94.49766199026737 }, { "name": "smsp__thread_inst_executed_per_inst_executed.ratio", @@ -19315,30 +20743,47 @@ "time" ], "times": { - "compilation": 18222.788, - "data": 62183.257, - "framework": 1901114.846, - "kernel_overhead": 878971.023, - "profiling_overhead": 51190.519, - "profiling_runs": 908770.047, + "compilation_time": 14889.316, + "data": 58551.531, + "framework": 3468168.438, + "kernel_overhead": 1660172.303, + "profiling_overhead": 48494.321, + "profiling_runs": 1700950.283, "runtimes": [ - 3711.424 + 5890.464 ], - "search_algorithm": 34.527, - "validation": 14.735 + "search_algorithm": 37.604, + "validation": 17.681 }, - "timestamp": "2026-01-27 09:25:40 UTC" + "timestamp": "2026-03-13 09:40:30 UTC" }, { + "compilation_data": { + "constant_memory_size": 0, + "global_size": { + "x": 8, + "y": 32, + "z": 256 + }, + "local_memory_size": 0, + "local_size": { + "x": 32, + "y": 8, + "z": 1 + }, + "max_work_group_size": 1024, + "private_memory_size": 0, + "registers": 44 + }, "configuration": { - "INNER_UNROLL_FACTOR": "4", - "USE_CONSTANT_MEMORY": "0", - "USE_SOA": "0", - "VECTOR_SIZE": "1", - "WORK_GROUP_SIZE_X": "32", - "WORK_GROUP_SIZE_Y": "8", - "WORK_GROUP_SIZE_Z": "1", - "Z_ITERATIONS": "16" + "INNER_UNROLL_FACTOR": 0, + "USE_CONSTANT_MEMORY": 0, + "USE_SOA": 0, + "VECTOR_SIZE": 1, + "WORK_GROUP_SIZE_X": 32, + "WORK_GROUP_SIZE_Y": 8, + "WORK_GROUP_SIZE_Z": 1, + "Z_ITERATIONS": 1 }, "correctness": 1, "invalidity": "correct", @@ -19346,61 +20791,61 @@ { "name": "time", "unit": "", - "value": 3708.608 + "value": 3676.544 }, { "name": "dram__throughput.avg.pct_of_peak_sustained_elapsed", "type": "Double", "unit": "", - "value": 27.87200060735718 + "value": 13.040931456482166 }, { "name": "dram__sectors_read.sum", "type": "Double", "unit": "", - "value": 2098084.0 + "value": 10800.0 }, { "name": "dram__sectors_write.sum", "type": "Double", "unit": "", - "value": 1837496.0 + "value": 1841488.0 }, { "name": "lts__t_sectors.avg.pct_of_peak_sustained_elapsed", "type": "Double", "unit": "", - "value": 3.308816262721178 + "value": 1.6927700555485554 }, { "name": "lts__t_sectors_op_read.sum", "type": "Double", "unit": "", - "value": 2158284.0 + "value": 71790.0 }, { "name": "lts__t_sectors_op_write.sum", "type": "Double", "unit": "", - "value": 2100062.0 + "value": 2105194.0 }, { "name": "l1tex__t_requests_pipe_lsu_mem_global_op_ld.avg.pct_of_peak_sustained_active", "type": "Double", "unit": "", - "value": 1.7631040283635486 + "value": 22.54942837054676 }, { "name": "l1tex__t_requests_pipe_lsu_mem_global_op_ld.sum", "type": "Double", "unit": "", - "value": 2621440.0 + "value": 33554432.0 }, { "name": "l1tex__data_pipe_lsu_wavefronts_mem_shared.avg.pct_of_peak_sustained_elapsed", "type": "Double", "unit": "", - "value": 0.06610059070409878 + "value": 1.0566778733024913 }, { "name": "l1tex__data_pipe_lsu_wavefronts_mem_shared_op_ld.sum", @@ -19430,13 +20875,13 @@ "name": "sm__warps_active.avg.pct_of_peak_sustained_active", "type": "Double", "unit": "", - "value": 98.4012568088631 + "value": 72.35160803913173 }, { "name": "smsp__cycles_active.avg.pct_of_peak_sustained_elapsed", "type": "Double", "unit": "", - "value": 99.96104560881587 + "value": 99.95950820364654 }, { "name": "smsp__sass_thread_inst_executed_op_fp16_pred_on.sum", @@ -19448,7 +20893,7 @@ "name": "smsp__sass_thread_inst_executed_op_fp32_pred_on.sum", "type": "Double", "unit": "", - "value": 4647288832.0 + "value": 8589934592.0 }, { "name": "smsp__sass_thread_inst_executed_op_fp64_pred_on.sum", @@ -19460,43 +20905,43 @@ "name": "smsp__sass_thread_inst_executed_op_integer_pred_on.sum", "type": "Double", "unit": "", - "value": 2695888896.0 + "value": 587202560.0 }, { "name": "smsp__sass_thread_inst_executed_op_control_pred_on.sum", "type": "Double", "unit": "", - "value": 268435456.0 + "value": 117440512.0 }, { "name": "smsp__sass_thread_inst_executed_op_memory_pred_on.sum", "type": "Double", "unit": "", - "value": 100663296.0 + "value": 1090519040.0 }, { "name": "smsp__sass_thread_inst_executed_op_misc_pred_on.sum", "type": "Double", "unit": "", - "value": 1763704832.0 + "value": 234881024.0 }, { "name": "smsp__sass_thread_inst_executed_op_conversion_pred_on.sum", "type": "Double", "unit": "", - "value": 3145728.0 + "value": 50331648.0 }, { "name": "smsp__inst_executed.sum", "type": "Double", "unit": "", - "value": 499810304.0 + "value": 341311488.0 }, { "name": "smsp__pipe_fma_cycles_active.avg.pct_of_peak_sustained_active", "type": "Double", "unit": "", - "value": 31.06306455197106 + "value": 40.08077323559749 }, { "name": "smsp__inst_executed_pipe_fp64.avg.pct_of_peak_sustained_active", @@ -19508,13 +20953,13 @@ "name": "smsp__inst_executed_pipe_xu.avg.pct_of_peak_sustained_active", "type": "Double", "unit": "", - "value": 45.14225481119326 + "value": 45.103185684335855 }, { "name": "smsp__inst_executed_pipe_lsu.avg.pct_of_peak_sustained_active", "type": "Double", "unit": "", - "value": 1.091084772047884 + "value": 11.980533697401711 }, { "name": "smsp__inst_executed_pipe_tex.avg.pct_of_peak_sustained_active", @@ -19526,7 +20971,7 @@ "name": "smsp__issue_active.avg.pct_of_peak_sustained_active", "type": "Double", "unit": "", - "value": 84.05274489886828 + "value": 57.34866727810712 }, { "name": "smsp__thread_inst_executed_per_inst_executed.ratio", @@ -19545,30 +20990,47 @@ "time" ], "times": { - "compilation": 18400.724, - "data": 60909.469, - "framework": 1894303.9980000001, - "kernel_overhead": 876433.859, - "profiling_overhead": 50456.812, - "profiling_runs": 906503.858, + "compilation_time": 15031.125, + "data": 57539.871, + "framework": 242777.946, + "kernel_overhead": 53876.744, + "profiling_overhead": 47201.804, + "profiling_runs": 84159.527, "runtimes": [ - 3708.608 + 3676.544 ], - "search_algorithm": 26.48, - "validation": 17.314 + "search_algorithm": 23.131, + "validation": 14.219 }, - "timestamp": "2026-01-27 09:25:41 UTC" + "timestamp": "2026-03-13 09:40:30 UTC" }, { + "compilation_data": { + "constant_memory_size": 0, + "global_size": { + "x": 8, + "y": 32, + "z": 128 + }, + "local_memory_size": 0, + "local_size": { + "x": 32, + "y": 8, + "z": 1 + }, + "max_work_group_size": 1024, + "private_memory_size": 0, + "registers": 31 + }, "configuration": { - "INNER_UNROLL_FACTOR": "4", - "USE_CONSTANT_MEMORY": "0", - "USE_SOA": "0", - "VECTOR_SIZE": "1", - "WORK_GROUP_SIZE_X": "16", - "WORK_GROUP_SIZE_Y": "4", - "WORK_GROUP_SIZE_Z": "1", - "Z_ITERATIONS": "32" + "INNER_UNROLL_FACTOR": 0, + "USE_CONSTANT_MEMORY": 0, + "USE_SOA": 0, + "VECTOR_SIZE": 1, + "WORK_GROUP_SIZE_X": 32, + "WORK_GROUP_SIZE_Y": 8, + "WORK_GROUP_SIZE_Z": 1, + "Z_ITERATIONS": 2 }, "correctness": 1, "invalidity": "correct", @@ -19576,61 +21038,61 @@ { "name": "time", "unit": "", - "value": 6304.608 + "value": 2117.824 }, { "name": "dram__throughput.avg.pct_of_peak_sustained_elapsed", "type": "Double", "unit": "", - "value": 16.557134335989094 + "value": 22.541269710906704 }, { "name": "dram__sectors_read.sum", "type": "Double", "unit": "", - "value": 2113556.0 + "value": 5360.0 }, { "name": "dram__sectors_write.sum", "type": "Double", "unit": "", - "value": 1865836.0 + "value": 1839028.0 }, { "name": "lts__t_sectors.avg.pct_of_peak_sustained_elapsed", "type": "Double", "unit": "", - "value": 1.966928250302347 + "value": 2.895358781142724 }, { "name": "lts__t_sectors_op_read.sum", "type": "Double", "unit": "", - "value": 2206771.0 + "value": 40711.0 }, { "name": "lts__t_sectors_op_write.sum", "type": "Double", "unit": "", - "value": 2102215.0 + "value": 2100383.0 }, { "name": "l1tex__t_requests_pipe_lsu_mem_global_op_ld.avg.pct_of_peak_sustained_active", "type": "Double", "unit": "", - "value": 0.6214888179871144 + "value": 19.82883176662523 }, { "name": "l1tex__t_requests_pipe_lsu_mem_global_op_ld.sum", "type": "Double", "unit": "", - "value": 1572864.0 + "value": 16777216.0 }, { "name": "l1tex__data_pipe_lsu_wavefronts_mem_shared.avg.pct_of_peak_sustained_elapsed", "type": "Double", "unit": "", - "value": 0.019417370177624693 + "value": 0.9288732224683685 }, { "name": "l1tex__data_pipe_lsu_wavefronts_mem_shared_op_ld.sum", @@ -19660,13 +21122,13 @@ "name": "sm__warps_active.avg.pct_of_peak_sustained_active", "type": "Double", "unit": "", - "value": 73.8036460324469 + "value": 90.47631412796376 }, { "name": "smsp__cycles_active.avg.pct_of_peak_sustained_elapsed", "type": "Double", "unit": "", - "value": 99.98478721814315 + "value": 99.90820604366661 }, { "name": "smsp__sass_thread_inst_executed_op_fp16_pred_on.sum", @@ -19678,7 +21140,7 @@ "name": "smsp__sass_thread_inst_executed_op_fp32_pred_on.sum", "type": "Double", "unit": "", - "value": 4479516672.0 + "value": 6442450944.0 }, { "name": "smsp__sass_thread_inst_executed_op_fp64_pred_on.sum", @@ -19690,43 +21152,43 @@ "name": "smsp__sass_thread_inst_executed_op_integer_pred_on.sum", "type": "Double", "unit": "", - "value": 4333240320.0 + "value": 704643072.0 }, { "name": "smsp__sass_thread_inst_executed_op_control_pred_on.sum", "type": "Double", "unit": "", - "value": 268435456.0 + "value": 142606336.0 }, { "name": "smsp__sass_thread_inst_executed_op_memory_pred_on.sum", "type": "Double", "unit": "", - "value": 67108864.0 + "value": 553648128.0 }, { "name": "smsp__sass_thread_inst_executed_op_misc_pred_on.sum", "type": "Double", "unit": "", - "value": 1492647936.0 + "value": 134217728.0 }, { "name": "smsp__sass_thread_inst_executed_op_conversion_pred_on.sum", "type": "Double", "unit": "", - "value": 1572864.0 + "value": 25165824.0 }, { "name": "smsp__inst_executed.sum", "type": "Double", "unit": "", - "value": 853852160.0 + "value": 256901120.0 }, { "name": "smsp__pipe_fma_cycles_active.avg.pct_of_peak_sustained_active", "type": "Double", "unit": "", - "value": 29.010784063511068 + "value": 50.279944035428194 }, { "name": "smsp__inst_executed_pipe_fp64.avg.pct_of_peak_sustained_active", @@ -19738,13 +21200,13 @@ "name": "smsp__inst_executed_pipe_xu.avg.pct_of_peak_sustained_active", "type": "Double", "unit": "", - "value": 26.515216451221175 + "value": 79.33667458973002 }, { "name": "smsp__inst_executed_pipe_lsu.avg.pct_of_peak_sustained_active", "type": "Double", "unit": "", - "value": 0.42401041932494793 + "value": 10.691856536506586 }, { "name": "smsp__inst_executed_pipe_tex.avg.pct_of_peak_sustained_active", @@ -19756,7 +21218,7 @@ "name": "smsp__issue_active.avg.pct_of_peak_sustained_active", "type": "Double", "unit": "", - "value": 84.34126601983543 + "value": 75.92878576509746 }, { "name": "smsp__thread_inst_executed_per_inst_executed.ratio", @@ -19775,30 +21237,47 @@ "time" ], "times": { - "compilation": 16326.723, - "data": 62618.484, - "framework": 3040494.232, - "kernel_overhead": 1441899.219, - "profiling_overhead": 51554.365, - "profiling_runs": 1484422.164, + "compilation_time": 14753.418, + "data": 57794.656, + "framework": 253746.623, + "kernel_overhead": 62801.203, + "profiling_overhead": 48161.41, + "profiling_runs": 84989.354, "runtimes": [ - 6304.608 + 2117.824 ], - "search_algorithm": 31.891, - "validation": 18.106 + "search_algorithm": 36.239, + "validation": 16.702 }, - "timestamp": "2026-01-27 09:25:42 UTC" + "timestamp": "2026-03-13 09:40:30 UTC" }, { + "compilation_data": { + "constant_memory_size": 0, + "global_size": { + "x": 8, + "y": 32, + "z": 64 + }, + "local_memory_size": 0, + "local_size": { + "x": 32, + "y": 8, + "z": 1 + }, + "max_work_group_size": 1024, + "private_memory_size": 0, + "registers": 39 + }, "configuration": { - "INNER_UNROLL_FACTOR": "4", - "USE_CONSTANT_MEMORY": "0", - "USE_SOA": "0", - "VECTOR_SIZE": "1", - "WORK_GROUP_SIZE_X": "16", - "WORK_GROUP_SIZE_Y": "8", - "WORK_GROUP_SIZE_Z": "1", - "Z_ITERATIONS": "32" + "INNER_UNROLL_FACTOR": 0, + "USE_CONSTANT_MEMORY": 0, + "USE_SOA": 0, + "VECTOR_SIZE": 1, + "WORK_GROUP_SIZE_X": 32, + "WORK_GROUP_SIZE_Y": 8, + "WORK_GROUP_SIZE_Z": 1, + "Z_ITERATIONS": 4 }, "correctness": 1, "invalidity": "correct", @@ -19806,61 +21285,61 @@ { "name": "time", "unit": "", - "value": 6214.304 + "value": 1915.424 }, { "name": "dram__throughput.avg.pct_of_peak_sustained_elapsed", "type": "Double", "unit": "", - "value": 16.713102288454728 + "value": 25.787899533129828 }, { "name": "dram__sectors_read.sum", "type": "Double", "unit": "", - "value": 2112876.0 + "value": 660.0 }, { "name": "dram__sectors_write.sum", "type": "Double", "unit": "", - "value": 1875036.0 + "value": 1837584.0 }, { "name": "lts__t_sectors.avg.pct_of_peak_sustained_elapsed", "type": "Double", "unit": "", - "value": 1.9836119355036776 + "value": 3.3015241077249216 }, { "name": "lts__t_sectors_op_read.sum", "type": "Double", "unit": "", - "value": 2209935.0 + "value": 30642.0 }, { "name": "lts__t_sectors_op_write.sum", "type": "Double", "unit": "", - "value": 2110584.0 + "value": 2099355.0 }, { "name": "l1tex__t_requests_pipe_lsu_mem_global_op_ld.avg.pct_of_peak_sustained_active", "type": "Double", "unit": "", - "value": 0.6214214825981468 + "value": 11.380418205211294 }, { "name": "l1tex__t_requests_pipe_lsu_mem_global_op_ld.sum", "type": "Double", "unit": "", - "value": 1572864.0 + "value": 8388608.0 }, { "name": "l1tex__data_pipe_lsu_wavefronts_mem_shared.avg.pct_of_peak_sustained_elapsed", "type": "Double", "unit": "", - "value": 0.01941754016363639 + "value": 0.5330977892580449 }, { "name": "l1tex__data_pipe_lsu_wavefronts_mem_shared_op_ld.sum", @@ -19890,13 +21369,13 @@ "name": "sm__warps_active.avg.pct_of_peak_sustained_active", "type": "Double", "unit": "", - "value": 73.76530176446796 + "value": 94.30376504681475 }, { "name": "smsp__cycles_active.avg.pct_of_peak_sustained_elapsed", "type": "Double", "unit": "", - "value": 99.97475941007212 + "value": 99.88919487288169 }, { "name": "smsp__sass_thread_inst_executed_op_fp16_pred_on.sum", @@ -19908,7 +21387,7 @@ "name": "smsp__sass_thread_inst_executed_op_fp32_pred_on.sum", "type": "Double", "unit": "", - "value": 4479516672.0 + "value": 5368709120.0 }, { "name": "smsp__sass_thread_inst_executed_op_fp64_pred_on.sum", @@ -19920,43 +21399,43 @@ "name": "smsp__sass_thread_inst_executed_op_integer_pred_on.sum", "type": "Double", "unit": "", - "value": 4333240320.0 + "value": 390070272.0 }, { "name": "smsp__sass_thread_inst_executed_op_control_pred_on.sum", "type": "Double", "unit": "", - "value": 268435456.0 + "value": 71303168.0 }, { "name": "smsp__sass_thread_inst_executed_op_memory_pred_on.sum", "type": "Double", "unit": "", - "value": 67108864.0 + "value": 285212672.0 }, { "name": "smsp__sass_thread_inst_executed_op_misc_pred_on.sum", "type": "Double", "unit": "", - "value": 1492647936.0 + "value": 88080384.0 }, { "name": "smsp__sass_thread_inst_executed_op_conversion_pred_on.sum", "type": "Double", "unit": "", - "value": 1572864.0 + "value": 12582912.0 }, { "name": "smsp__inst_executed.sum", "type": "Double", "unit": "", - "value": 853852160.0 + "value": 197394432.0 }, { "name": "smsp__pipe_fma_cycles_active.avg.pct_of_peak_sustained_active", "type": "Double", "unit": "", - "value": 29.013906053133496 + "value": 46.20739680292629 }, { "name": "smsp__inst_executed_pipe_fp64.avg.pct_of_peak_sustained_active", @@ -19968,13 +21447,13 @@ "name": "smsp__inst_executed_pipe_xu.avg.pct_of_peak_sustained_active", "type": "Double", "unit": "", - "value": 26.51810816368978 + "value": 91.08294727555143 }, { "name": "smsp__inst_executed_pipe_lsu.avg.pct_of_peak_sustained_active", "type": "Double", "unit": "", - "value": 0.424056661309004 + "value": 6.3153215396134295 }, { "name": "smsp__inst_executed_pipe_tex.avg.pct_of_peak_sustained_active", @@ -19986,7 +21465,7 @@ "name": "smsp__issue_active.avg.pct_of_peak_sustained_active", "type": "Double", "unit": "", - "value": 84.35046416049614 + "value": 66.9794126999452 }, { "name": "smsp__thread_inst_executed_per_inst_executed.ratio", @@ -20005,30 +21484,47 @@ "time" ], "times": { - "compilation": 17162.706, - "data": 62594.818, - "framework": 3045226.398, - "kernel_overhead": 1443823.312, - "profiling_overhead": 52503.207, - "profiling_runs": 1486305.061, + "compilation_time": 14997.234, + "data": 57158.142, + "framework": 195996.523, + "kernel_overhead": 35286.289, + "profiling_overhead": 47364.183, + "profiling_runs": 56187.909, "runtimes": [ - 6214.304 + 1915.424 ], - "search_algorithm": 26.385, - "validation": 18.569 + "search_algorithm": 22.391, + "validation": 13.607 }, - "timestamp": "2026-01-27 09:25:44 UTC" + "timestamp": "2026-03-13 09:40:30 UTC" }, { + "compilation_data": { + "constant_memory_size": 0, + "global_size": { + "x": 8, + "y": 32, + "z": 32 + }, + "local_memory_size": 0, + "local_size": { + "x": 32, + "y": 8, + "z": 1 + }, + "max_work_group_size": 1024, + "private_memory_size": 0, + "registers": 40 + }, "configuration": { - "INNER_UNROLL_FACTOR": "4", - "USE_CONSTANT_MEMORY": "0", - "USE_SOA": "0", - "VECTOR_SIZE": "1", - "WORK_GROUP_SIZE_X": "32", - "WORK_GROUP_SIZE_Y": "2", - "WORK_GROUP_SIZE_Z": "1", - "Z_ITERATIONS": "32" + "INNER_UNROLL_FACTOR": 0, + "USE_CONSTANT_MEMORY": 0, + "USE_SOA": 0, + "VECTOR_SIZE": 1, + "WORK_GROUP_SIZE_X": 32, + "WORK_GROUP_SIZE_Y": 8, + "WORK_GROUP_SIZE_Z": 1, + "Z_ITERATIONS": 8 }, "correctness": 1, "invalidity": "correct", @@ -20036,61 +21532,61 @@ { "name": "time", "unit": "", - "value": 6226.144 + "value": 1801.76 }, { "name": "dram__throughput.avg.pct_of_peak_sustained_elapsed", "type": "Double", "unit": "", - "value": 16.479711158260198 + "value": 26.551443353112447 }, { "name": "dram__sectors_read.sum", "type": "Double", "unit": "", - "value": 2099136.0 + "value": 5428.0 }, { "name": "dram__sectors_write.sum", "type": "Double", "unit": "", - "value": 1837896.0 + "value": 1838848.0 }, { "name": "lts__t_sectors.avg.pct_of_peak_sustained_elapsed", "type": "Double", "unit": "", - "value": 1.9746111012742023 + "value": 3.4691322084737455 }, { "name": "lts__t_sectors_op_read.sum", "type": "Double", "unit": "", - "value": 2200854.0 + "value": 34716.0 }, { "name": "lts__t_sectors_op_write.sum", "type": "Double", "unit": "", - "value": 2099054.0 + "value": 2100488.0 }, { "name": "l1tex__t_requests_pipe_lsu_mem_global_op_ld.avg.pct_of_peak_sustained_active", "type": "Double", "unit": "", - "value": 0.6215698863351113 + "value": 5.875582395554313 }, { "name": "l1tex__t_requests_pipe_lsu_mem_global_op_ld.sum", "type": "Double", "unit": "", - "value": 1572864.0 + "value": 4194304.0 }, { "name": "l1tex__data_pipe_lsu_wavefronts_mem_shared.avg.pct_of_peak_sustained_elapsed", "type": "Double", "unit": "", - "value": 0.019420850881718958 + "value": 0.27522587481590294 }, { "name": "l1tex__data_pipe_lsu_wavefronts_mem_shared_op_ld.sum", @@ -20120,13 +21616,13 @@ "name": "sm__warps_active.avg.pct_of_peak_sustained_active", "type": "Double", "unit": "", - "value": 73.81699622124492 + "value": 97.17111416997169 }, { "name": "smsp__cycles_active.avg.pct_of_peak_sustained_elapsed", "type": "Double", "unit": "", - "value": 99.97877661978637 + "value": 99.898441445012 }, { "name": "smsp__sass_thread_inst_executed_op_fp16_pred_on.sum", @@ -20138,7 +21634,7 @@ "name": "smsp__sass_thread_inst_executed_op_fp32_pred_on.sum", "type": "Double", "unit": "", - "value": 4479516672.0 + "value": 4831838208.0 }, { "name": "smsp__sass_thread_inst_executed_op_fp64_pred_on.sum", @@ -20150,43 +21646,43 @@ "name": "smsp__sass_thread_inst_executed_op_integer_pred_on.sum", "type": "Double", "unit": "", - "value": 4333240320.0 + "value": 362807296.0 }, { "name": "smsp__sass_thread_inst_executed_op_control_pred_on.sum", "type": "Double", "unit": "", - "value": 268435456.0 + "value": 69206016.0 }, { "name": "smsp__sass_thread_inst_executed_op_memory_pred_on.sum", "type": "Double", "unit": "", - "value": 67108864.0 + "value": 150994944.0 }, { "name": "smsp__sass_thread_inst_executed_op_misc_pred_on.sum", "type": "Double", "unit": "", - "value": 1492647936.0 + "value": 127926272.0 }, { "name": "smsp__sass_thread_inst_executed_op_conversion_pred_on.sum", "type": "Double", "unit": "", - "value": 1572864.0 + "value": 6291456.0 }, { "name": "smsp__inst_executed.sum", "type": "Double", "unit": "", - "value": 853852160.0 + "value": 173801472.0 }, { "name": "smsp__pipe_fma_cycles_active.avg.pct_of_peak_sustained_active", "type": "Double", "unit": "", - "value": 29.01765199415382 + "value": 43.20921280112353 }, { "name": "smsp__inst_executed_pipe_fp64.avg.pct_of_peak_sustained_active", @@ -20198,13 +21694,13 @@ "name": "smsp__inst_executed_pipe_xu.avg.pct_of_peak_sustained_active", "type": "Double", "unit": "", - "value": 26.521563842839917 + "value": 94.0392701944256 }, { "name": "smsp__inst_executed_pipe_lsu.avg.pct_of_peak_sustained_active", "type": "Double", "unit": "", - "value": 0.4241119218032262 + "value": 3.4438209299716402 }, { "name": "smsp__inst_executed_pipe_tex.avg.pct_of_peak_sustained_active", @@ -20216,7 +21712,7 @@ "name": "smsp__issue_active.avg.pct_of_peak_sustained_active", "type": "Double", "unit": "", - "value": 84.36145620180258 + "value": 60.888001193846705 }, { "name": "smsp__thread_inst_executed_per_inst_executed.ratio", @@ -20235,30 +21731,47 @@ "time" ], "times": { - "compilation": 15503.054, - "data": 62661.1, - "framework": 3040916.04, - "kernel_overhead": 1441631.324, - "profiling_overhead": 52709.826, - "profiling_runs": 1483913.79, + "compilation_time": 15896.335, + "data": 57649.619, + "framework": 185313.43899999998, + "kernel_overhead": 29796.376, + "profiling_overhead": 47287.241, + "profiling_runs": 50580.203, "runtimes": [ - 6226.144 + 1801.76 ], - "search_algorithm": 32.112, - "validation": 14.497 + "search_algorithm": 27.185, + "validation": 17.711 }, - "timestamp": "2026-01-27 09:25:45 UTC" + "timestamp": "2026-03-13 09:40:30 UTC" }, { + "compilation_data": { + "constant_memory_size": 0, + "global_size": { + "x": 8, + "y": 32, + "z": 16 + }, + "local_memory_size": 0, + "local_size": { + "x": 32, + "y": 8, + "z": 1 + }, + "max_work_group_size": 1024, + "private_memory_size": 0, + "registers": 40 + }, "configuration": { - "INNER_UNROLL_FACTOR": "4", - "USE_CONSTANT_MEMORY": "0", - "USE_SOA": "0", - "VECTOR_SIZE": "1", - "WORK_GROUP_SIZE_X": "32", - "WORK_GROUP_SIZE_Y": "4", - "WORK_GROUP_SIZE_Z": "1", - "Z_ITERATIONS": "32" + "INNER_UNROLL_FACTOR": 0, + "USE_CONSTANT_MEMORY": 0, + "USE_SOA": 0, + "VECTOR_SIZE": 1, + "WORK_GROUP_SIZE_X": 32, + "WORK_GROUP_SIZE_Y": 8, + "WORK_GROUP_SIZE_Z": 1, + "Z_ITERATIONS": 16 }, "correctness": 1, "invalidity": "correct", @@ -20266,61 +21779,61 @@ { "name": "time", "unit": "", - "value": 6259.232 + "value": 1778.56 }, { "name": "dram__throughput.avg.pct_of_peak_sustained_elapsed", "type": "Double", "unit": "", - "value": 16.42779001448874 + "value": 27.51541641092527 }, { "name": "dram__sectors_read.sum", "type": "Double", "unit": "", - "value": 2109036.0 + "value": 524.0 }, { "name": "dram__sectors_write.sum", "type": "Double", "unit": "", - "value": 1838512.0 + "value": 1835692.0 }, { "name": "lts__t_sectors.avg.pct_of_peak_sustained_elapsed", "type": "Double", "unit": "", - "value": 1.968960238684181 + "value": 3.535438803144819 }, { "name": "lts__t_sectors_op_read.sum", "type": "Double", "unit": "", - "value": 2210881.0 + "value": 29258.0 }, { "name": "lts__t_sectors_op_write.sum", "type": "Double", "unit": "", - "value": 2101773.0 + "value": 2099303.0 }, { "name": "l1tex__t_requests_pipe_lsu_mem_global_op_ld.avg.pct_of_peak_sustained_active", "type": "Double", "unit": "", - "value": 0.6215651677387657 + "value": 3.003679935882598 }, { "name": "l1tex__t_requests_pipe_lsu_mem_global_op_ld.sum", "type": "Double", "unit": "", - "value": 1572864.0 + "value": 2097152.0 }, { "name": "l1tex__data_pipe_lsu_wavefronts_mem_shared.avg.pct_of_peak_sustained_elapsed", "type": "Double", "unit": "", - "value": 0.019420915953489055 + "value": 0.14069889611823105 }, { "name": "l1tex__data_pipe_lsu_wavefronts_mem_shared_op_ld.sum", @@ -20350,13 +21863,13 @@ "name": "sm__warps_active.avg.pct_of_peak_sustained_active", "type": "Double", "unit": "", - "value": 73.7987858252636 + "value": 98.02594052159382 }, { "name": "smsp__cycles_active.avg.pct_of_peak_sustained_elapsed", "type": "Double", "unit": "", - "value": 99.9778787382853 + "value": 99.91443336176012 }, { "name": "smsp__sass_thread_inst_executed_op_fp16_pred_on.sum", @@ -20368,7 +21881,7 @@ "name": "smsp__sass_thread_inst_executed_op_fp32_pred_on.sum", "type": "Double", "unit": "", - "value": 4479516672.0 + "value": 4563402752.0 }, { "name": "smsp__sass_thread_inst_executed_op_fp64_pred_on.sum", @@ -20380,43 +21893,43 @@ "name": "smsp__sass_thread_inst_executed_op_integer_pred_on.sum", "type": "Double", "unit": "", - "value": 4333240320.0 + "value": 209715200.0 }, { "name": "smsp__sass_thread_inst_executed_op_control_pred_on.sum", "type": "Double", "unit": "", - "value": 268435456.0 + "value": 67108864.0 }, { "name": "smsp__sass_thread_inst_executed_op_memory_pred_on.sum", "type": "Double", "unit": "", - "value": 67108864.0 + "value": 83886080.0 }, { "name": "smsp__sass_thread_inst_executed_op_misc_pred_on.sum", "type": "Double", "unit": "", - "value": 1492647936.0 + "value": 160432128.0 }, { "name": "smsp__sass_thread_inst_executed_op_conversion_pred_on.sum", "type": "Double", "unit": "", - "value": 1572864.0 + "value": 3145728.0 }, { "name": "smsp__inst_executed.sum", "type": "Double", "unit": "", - "value": 853852160.0 + "value": 163381248.0 }, { "name": "smsp__pipe_fma_cycles_active.avg.pct_of_peak_sustained_active", "type": "Double", "unit": "", - "value": 29.0180499372562 + "value": 39.64005106078096 }, { "name": "smsp__inst_executed_pipe_fp64.avg.pct_of_peak_sustained_active", @@ -20428,13 +21941,13 @@ "name": "smsp__inst_executed_pipe_xu.avg.pct_of_peak_sustained_active", "type": "Double", "unit": "", - "value": 26.521890892059645 + "value": 96.1327039397227 }, { "name": "smsp__inst_executed_pipe_lsu.avg.pct_of_peak_sustained_active", "type": "Double", "unit": "", - "value": 0.4241171517162858 + "value": 1.9480015690910606 }, { "name": "smsp__inst_executed_pipe_tex.avg.pct_of_peak_sustained_active", @@ -20446,7 +21959,7 @@ "name": "smsp__issue_active.avg.pct_of_peak_sustained_active", "type": "Double", "unit": "", - "value": 84.36249650050402 + "value": 58.51174887300367 }, { "name": "smsp__thread_inst_executed_per_inst_executed.ratio", @@ -20465,30 +21978,47 @@ "time" ], "times": { - "compilation": 14774.667, - "data": 61352.466, - "framework": 3038709.1720000003, - "kernel_overhead": 1441874.415, - "profiling_overhead": 51082.52, - "profiling_runs": 1484399.771, + "compilation_time": 15772.63, + "data": 57124.496, + "framework": 191622.517, + "kernel_overhead": 33372.821, + "profiling_overhead": 47253.896, + "profiling_runs": 53871.304, "runtimes": [ - 6259.232 + 1778.56 ], - "search_algorithm": 27.751, - "validation": 14.815 + "search_algorithm": 42.291, + "validation": 14.398 }, - "timestamp": "2026-01-27 09:25:47 UTC" + "timestamp": "2026-03-13 09:40:30 UTC" }, { + "compilation_data": { + "constant_memory_size": 0, + "global_size": { + "x": 8, + "y": 32, + "z": 8 + }, + "local_memory_size": 0, + "local_size": { + "x": 32, + "y": 8, + "z": 1 + }, + "max_work_group_size": 1024, + "private_memory_size": 0, + "registers": 48 + }, "configuration": { - "INNER_UNROLL_FACTOR": "4", - "USE_CONSTANT_MEMORY": "0", - "USE_SOA": "0", - "VECTOR_SIZE": "1", - "WORK_GROUP_SIZE_X": "32", - "WORK_GROUP_SIZE_Y": "8", - "WORK_GROUP_SIZE_Z": "1", - "Z_ITERATIONS": "32" + "INNER_UNROLL_FACTOR": 0, + "USE_CONSTANT_MEMORY": 0, + "USE_SOA": 0, + "VECTOR_SIZE": 1, + "WORK_GROUP_SIZE_X": 32, + "WORK_GROUP_SIZE_Y": 8, + "WORK_GROUP_SIZE_Z": 1, + "Z_ITERATIONS": 32 }, "correctness": 1, "invalidity": "correct", @@ -20496,61 +22026,61 @@ { "name": "time", "unit": "", - "value": 6158.56 + "value": 1810.592 }, { "name": "dram__throughput.avg.pct_of_peak_sustained_elapsed", "type": "Double", "unit": "", - "value": 16.591078595859006 + "value": 27.29094279258753 }, { "name": "dram__sectors_read.sum", "type": "Double", "unit": "", - "value": 2098012.0 + "value": 1812.0 }, { "name": "dram__sectors_write.sum", "type": "Double", "unit": "", - "value": 1838528.0 + "value": 1838008.0 }, { "name": "lts__t_sectors.avg.pct_of_peak_sustained_elapsed", "type": "Double", "unit": "", - "value": 1.9752618106236735 + "value": 3.5257109843983683 }, { "name": "lts__t_sectors_op_read.sum", "type": "Double", "unit": "", - "value": 2199070.0 + "value": 29941.0 }, { "name": "lts__t_sectors_op_write.sum", "type": "Double", "unit": "", - "value": 2099902.0 + "value": 2099375.0 }, { "name": "l1tex__t_requests_pipe_lsu_mem_global_op_ld.avg.pct_of_peak_sustained_active", "type": "Double", "unit": "", - "value": 0.6222538154222149 + "value": 1.4974026481070932 }, { "name": "l1tex__t_requests_pipe_lsu_mem_global_op_ld.sum", "type": "Double", "unit": "", - "value": 1572864.0 + "value": 1048576.0 }, { "name": "l1tex__data_pipe_lsu_wavefronts_mem_shared.avg.pct_of_peak_sustained_elapsed", "type": "Double", "unit": "", - "value": 0.019441488807948784 + "value": 0.07008449493478686 }, { "name": "l1tex__data_pipe_lsu_wavefronts_mem_shared_op_ld.sum", @@ -20580,13 +22110,13 @@ "name": "sm__warps_active.avg.pct_of_peak_sustained_active", "type": "Double", "unit": "", - "value": 65.67446551041674 + "value": 81.5524370488381 }, { "name": "smsp__cycles_active.avg.pct_of_peak_sustained_elapsed", "type": "Double", "unit": "", - "value": 99.96877553000681 + "value": 99.8299315290226 }, { "name": "smsp__sass_thread_inst_executed_op_fp16_pred_on.sum", @@ -20598,7 +22128,7 @@ "name": "smsp__sass_thread_inst_executed_op_fp32_pred_on.sum", "type": "Double", "unit": "", - "value": 4479516672.0 + "value": 4429185024.0 }, { "name": "smsp__sass_thread_inst_executed_op_fp64_pred_on.sum", @@ -20610,25 +22140,25 @@ "name": "smsp__sass_thread_inst_executed_op_integer_pred_on.sum", "type": "Double", "unit": "", - "value": 4333240320.0 + "value": 138412032.0 }, { "name": "smsp__sass_thread_inst_executed_op_control_pred_on.sum", "type": "Double", "unit": "", - "value": 268435456.0 + "value": 33554432.0 }, { "name": "smsp__sass_thread_inst_executed_op_memory_pred_on.sum", "type": "Double", "unit": "", - "value": 67108864.0 + "value": 50331648.0 }, { "name": "smsp__sass_thread_inst_executed_op_misc_pred_on.sum", "type": "Double", "unit": "", - "value": 1492647936.0 + "value": 82837504.0 }, { "name": "smsp__sass_thread_inst_executed_op_conversion_pred_on.sum", @@ -20640,13 +22170,13 @@ "name": "smsp__inst_executed.sum", "type": "Double", "unit": "", - "value": 853852160.0 + "value": 150192128.0 }, { "name": "smsp__pipe_fma_cycles_active.avg.pct_of_peak_sustained_active", "type": "Double", "unit": "", - "value": 29.05142398482834 + "value": 38.01990359255715 }, { "name": "smsp__inst_executed_pipe_fp64.avg.pct_of_peak_sustained_active", @@ -20658,13 +22188,13 @@ "name": "smsp__inst_executed_pipe_xu.avg.pct_of_peak_sustained_active", "type": "Double", "unit": "", - "value": 26.552403566403466 + "value": 95.85171062295778 }, { "name": "smsp__inst_executed_pipe_lsu.avg.pct_of_peak_sustained_active", "type": "Double", "unit": "", - "value": 0.42460508632798516 + "value": 1.1583641786709988 }, { "name": "smsp__inst_executed_pipe_tex.avg.pct_of_peak_sustained_active", @@ -20676,7 +22206,7 @@ "name": "smsp__issue_active.avg.pct_of_peak_sustained_active", "type": "Double", "unit": "", - "value": 84.45952306140065 + "value": 53.631101120092474 }, { "name": "smsp__thread_inst_executed_per_inst_executed.ratio", @@ -20695,260 +22225,47 @@ "time" ], "times": { - "compilation": 18554.509, - "data": 64526.241, - "framework": 3046114.266, - "kernel_overhead": 1442807.746, - "profiling_overhead": 53504.151, - "profiling_runs": 1485276.128, + "compilation_time": 15786.739, + "data": 56888.519, + "framework": 171378.13999999998, + "kernel_overhead": 23334.802, + "profiling_overhead": 47257.214, + "profiling_runs": 43897.605, "runtimes": [ - 6158.56 + 1810.592 ], - "search_algorithm": 24.601, - "validation": 21.77 + "search_algorithm": 35.802, + "validation": 15.892 }, - "timestamp": "2026-01-27 09:25:49 UTC" + "timestamp": "2026-03-13 09:40:31 UTC" }, { - "configuration": { - "INNER_UNROLL_FACTOR": "8", - "USE_CONSTANT_MEMORY": "0", - "USE_SOA": "0", - "VECTOR_SIZE": "1", - "WORK_GROUP_SIZE_X": "16", - "WORK_GROUP_SIZE_Y": "4", - "WORK_GROUP_SIZE_Z": "1", - "Z_ITERATIONS": "16" + "compilation_data": { + "constant_memory_size": 0, + "global_size": { + "x": 8, + "y": 32, + "z": 128 + }, + "local_memory_size": 0, + "local_size": { + "x": 32, + "y": 8, + "z": 1 + }, + "max_work_group_size": 1024, + "private_memory_size": 0, + "registers": 19 }, - "correctness": 1, - "invalidity": "correct", - "measurements": [ - { - "name": "time", - "unit": "", - "value": 3352.512 - }, - { - "name": "dram__throughput.avg.pct_of_peak_sustained_elapsed", - "type": "Double", - "unit": "", - "value": 31.6936574528572 - }, - { - "name": "dram__sectors_read.sum", - "type": "Double", - "unit": "", - "value": 2099032.0 - }, - { - "name": "dram__sectors_write.sum", - "type": "Double", - "unit": "", - "value": 1868020.0 - }, - { - "name": "lts__t_sectors.avg.pct_of_peak_sustained_elapsed", - "type": "Double", - "unit": "", - "value": 3.718127478726528 - }, - { - "name": "lts__t_sectors_op_read.sum", - "type": "Double", - "unit": "", - "value": 2142145.0 - }, - { - "name": "lts__t_sectors_op_write.sum", - "type": "Double", - "unit": "", - "value": 2099918.0 - }, - { - "name": "l1tex__t_requests_pipe_lsu_mem_global_op_ld.avg.pct_of_peak_sustained_active", - "type": "Double", - "unit": "", - "value": 1.9773097848940935 - }, - { - "name": "l1tex__t_requests_pipe_lsu_mem_global_op_ld.sum", - "type": "Double", - "unit": "", - "value": 2621440.0 - }, - { - "name": "l1tex__data_pipe_lsu_wavefronts_mem_shared.avg.pct_of_peak_sustained_elapsed", - "type": "Double", - "unit": "", - "value": 0.07413490126718918 - }, - { - "name": "l1tex__data_pipe_lsu_wavefronts_mem_shared_op_ld.sum", - "type": "Double", - "unit": "", - "value": 0.0 - }, - { - "name": "l1tex__data_pipe_lsu_wavefronts_mem_shared_op_st.sum", - "type": "Double", - "unit": "", - "value": 0.0 - }, - { - "name": "l1tex__t_sectors_pipe_lsu_mem_local_op_ld.sum", - "type": "Double", - "unit": "", - "value": 0.0 - }, - { - "name": "l1tex__t_sectors_pipe_lsu_mem_local_op_st.sum", - "type": "Double", - "unit": "", - "value": 0.0 - }, - { - "name": "sm__warps_active.avg.pct_of_peak_sustained_active", - "type": "Double", - "unit": "", - "value": 98.6127517884258 - }, - { - "name": "smsp__cycles_active.avg.pct_of_peak_sustained_elapsed", - "type": "Double", - "unit": "", - "value": 99.97147656958843 - }, - { - "name": "smsp__sass_thread_inst_executed_op_fp16_pred_on.sum", - "type": "Double", - "unit": "", - "value": 0.0 - }, - { - "name": "smsp__sass_thread_inst_executed_op_fp32_pred_on.sum", - "type": "Double", - "unit": "", - "value": 4647288832.0 - }, - { - "name": "smsp__sass_thread_inst_executed_op_fp64_pred_on.sum", - "type": "Double", - "unit": "", - "value": 0.0 - }, - { - "name": "smsp__sass_thread_inst_executed_op_integer_pred_on.sum", - "type": "Double", - "unit": "", - "value": 1824522240.0 - }, - { - "name": "smsp__sass_thread_inst_executed_op_control_pred_on.sum", - "type": "Double", - "unit": "", - "value": 134217728.0 - }, - { - "name": "smsp__sass_thread_inst_executed_op_memory_pred_on.sum", - "type": "Double", - "unit": "", - "value": 100663296.0 - }, - { - "name": "smsp__sass_thread_inst_executed_op_misc_pred_on.sum", - "type": "Double", - "unit": "", - "value": 1898971136.0 - }, - { - "name": "smsp__sass_thread_inst_executed_op_conversion_pred_on.sum", - "type": "Double", - "unit": "", - "value": 3145728.0 - }, - { - "name": "smsp__inst_executed.sum", - "type": "Double", - "unit": "", - "value": 472612864.0 - }, - { - "name": "smsp__pipe_fma_cycles_active.avg.pct_of_peak_sustained_active", - "type": "Double", - "unit": "", - "value": 37.60965139703659 - }, - { - "name": "smsp__inst_executed_pipe_fp64.avg.pct_of_peak_sustained_active", - "type": "Double", - "unit": "", - "value": 0.0 - }, - { - "name": "smsp__inst_executed_pipe_xu.avg.pct_of_peak_sustained_active", - "type": "Double", - "unit": "", - "value": 50.62386559480905 - }, - { - "name": "smsp__inst_executed_pipe_lsu.avg.pct_of_peak_sustained_active", - "type": "Double", - "unit": "", - "value": 1.223574876437035 - }, - { - "name": "smsp__inst_executed_pipe_tex.avg.pct_of_peak_sustained_active", - "type": "Double", - "unit": "", - "value": 0.0 - }, - { - "name": "smsp__issue_active.avg.pct_of_peak_sustained_active", - "type": "Double", - "unit": "", - "value": 89.13005208715167 - }, - { - "name": "smsp__thread_inst_executed_per_inst_executed.ratio", - "type": "Double", - "unit": "", - "value": 32.0 - }, - { - "name": "smsp__thread_inst_executed_per_inst_executed.pct", - "type": "Double", - "unit": "", - "value": 100.0 - } - ], - "objectives": [ - "time" - ], - "times": { - "compilation": 14930.161, - "data": 63105.907, - "framework": 1837364.4789999998, - "kernel_overhead": 846559.989, - "profiling_overhead": 53147.856, - "profiling_runs": 874550.727, - "runtimes": [ - 3352.512 - ], - "search_algorithm": 26.164, - "validation": 14.244 - }, - "timestamp": "2026-01-27 09:25:50 UTC" - }, - { "configuration": { - "INNER_UNROLL_FACTOR": "8", - "USE_CONSTANT_MEMORY": "0", - "USE_SOA": "0", - "VECTOR_SIZE": "1", - "WORK_GROUP_SIZE_X": "16", - "WORK_GROUP_SIZE_Y": "8", - "WORK_GROUP_SIZE_Z": "1", - "Z_ITERATIONS": "16" + "INNER_UNROLL_FACTOR": 1, + "USE_CONSTANT_MEMORY": 0, + "USE_SOA": 0, + "VECTOR_SIZE": 1, + "WORK_GROUP_SIZE_X": 32, + "WORK_GROUP_SIZE_Y": 8, + "WORK_GROUP_SIZE_Z": 1, + "Z_ITERATIONS": 2 }, "correctness": 1, "invalidity": "correct", @@ -20956,61 +22273,61 @@ { "name": "time", "unit": "", - "value": 3454.592 + "value": 5348.8 }, { "name": "dram__throughput.avg.pct_of_peak_sustained_elapsed", "type": "Double", "unit": "", - "value": 30.988912277022035 + "value": 9.264298551898865 }, { "name": "dram__sectors_read.sum", "type": "Double", "unit": "", - "value": 2100020.0 + "value": 6212.0 }, { "name": "dram__sectors_write.sum", "type": "Double", "unit": "", - "value": 1871400.0 + "value": 1837920.0 }, { "name": "lts__t_sectors.avg.pct_of_peak_sustained_elapsed", "type": "Double", "unit": "", - "value": 3.6598965531556136 + "value": 1.207515679406562 }, { "name": "lts__t_sectors_op_read.sum", "type": "Double", "unit": "", - "value": 2145650.0 + "value": 90079.0 }, { "name": "lts__t_sectors_op_write.sum", "type": "Double", "unit": "", - "value": 2100005.0 + "value": 2100718.0 }, { "name": "l1tex__t_requests_pipe_lsu_mem_global_op_ld.avg.pct_of_peak_sustained_active", "type": "Double", "unit": "", - "value": 1.9771926531315134 + "value": 7.958273511860672 }, { "name": "l1tex__t_requests_pipe_lsu_mem_global_op_ld.sum", "type": "Double", "unit": "", - "value": 2621440.0 + "value": 16777216.0 }, { "name": "l1tex__data_pipe_lsu_wavefronts_mem_shared.avg.pct_of_peak_sustained_elapsed", "type": "Double", "unit": "", - "value": 0.0741296999484567 + "value": 0.3729312405369703 }, { "name": "l1tex__data_pipe_lsu_wavefronts_mem_shared_op_ld.sum", @@ -21040,13 +22357,13 @@ "name": "sm__warps_active.avg.pct_of_peak_sustained_active", "type": "Double", "unit": "", - "value": 98.52416352289737 + "value": 96.5919553059146 }, { "name": "smsp__cycles_active.avg.pct_of_peak_sustained_elapsed", "type": "Double", "unit": "", - "value": 99.96670388484752 + "value": 99.95806105022952 }, { "name": "smsp__sass_thread_inst_executed_op_fp16_pred_on.sum", @@ -21058,7 +22375,7 @@ "name": "smsp__sass_thread_inst_executed_op_fp32_pred_on.sum", "type": "Double", "unit": "", - "value": 4647288832.0 + "value": 6979321856.0 }, { "name": "smsp__sass_thread_inst_executed_op_fp64_pred_on.sum", @@ -21070,43 +22387,43 @@ "name": "smsp__sass_thread_inst_executed_op_integer_pred_on.sum", "type": "Double", "unit": "", - "value": 1824522240.0 + "value": 8212447232.0 }, { "name": "smsp__sass_thread_inst_executed_op_control_pred_on.sum", "type": "Double", "unit": "", - "value": 134217728.0 + "value": 1073741824.0 }, { "name": "smsp__sass_thread_inst_executed_op_memory_pred_on.sum", "type": "Double", "unit": "", - "value": 100663296.0 + "value": 553648128.0 }, { "name": "smsp__sass_thread_inst_executed_op_misc_pred_on.sum", "type": "Double", "unit": "", - "value": 1898971136.0 + "value": 3321888768.0 }, { "name": "smsp__sass_thread_inst_executed_op_conversion_pred_on.sum", "type": "Double", "unit": "", - "value": 3145728.0 + "value": 25165824.0 }, { "name": "smsp__inst_executed.sum", "type": "Double", "unit": "", - "value": 472612864.0 + "value": 715390976.0 }, { "name": "smsp__pipe_fma_cycles_active.avg.pct_of_peak_sustained_active", "type": "Double", "unit": "", - "value": 37.608466443864415 + "value": 36.28260351733506 }, { "name": "smsp__inst_executed_pipe_fp64.avg.pct_of_peak_sustained_active", @@ -21118,13 +22435,13 @@ "name": "smsp__inst_executed_pipe_xu.avg.pct_of_peak_sustained_active", "type": "Double", "unit": "", - "value": 50.62273056747618 + "value": 31.83681788621662 }, { "name": "smsp__inst_executed_pipe_lsu.avg.pct_of_peak_sustained_active", "type": "Double", "unit": "", - "value": 1.2235474429150737 + "value": 4.290508660447161 }, { "name": "smsp__inst_executed_pipe_tex.avg.pct_of_peak_sustained_active", @@ -21136,7 +22453,7 @@ "name": "smsp__issue_active.avg.pct_of_peak_sustained_active", "type": "Double", "unit": "", - "value": 89.1280869113384 + "value": 84.84687565046562 }, { "name": "smsp__thread_inst_executed_per_inst_executed.ratio", @@ -21155,30 +22472,47 @@ "time" ], "times": { - "compilation": 15177.486, - "data": 62440.539, - "framework": 1836270.649, - "kernel_overhead": 846535.423, - "profiling_overhead": 52259.061, - "profiling_runs": 875035.626, + "compilation_time": 15855.153, + "data": 57489.225, + "framework": 2111278.6070000003, + "kernel_overhead": 983928.701, + "profiling_overhead": 47818.063, + "profiling_runs": 1022042.618, "runtimes": [ - 3454.592 + 5348.8 ], - "search_algorithm": 26.978, - "validation": 17.404 + "search_algorithm": 23.372, + "validation": 14.232 }, - "timestamp": "2026-01-27 09:25:51 UTC" + "timestamp": "2026-03-13 09:40:32 UTC" }, { + "compilation_data": { + "constant_memory_size": 0, + "global_size": { + "x": 8, + "y": 32, + "z": 64 + }, + "local_memory_size": 0, + "local_size": { + "x": 32, + "y": 8, + "z": 1 + }, + "max_work_group_size": 1024, + "private_memory_size": 0, + "registers": 22 + }, "configuration": { - "INNER_UNROLL_FACTOR": "8", - "USE_CONSTANT_MEMORY": "0", - "USE_SOA": "0", - "VECTOR_SIZE": "1", - "WORK_GROUP_SIZE_X": "32", - "WORK_GROUP_SIZE_Y": "2", - "WORK_GROUP_SIZE_Z": "1", - "Z_ITERATIONS": "16" + "INNER_UNROLL_FACTOR": 1, + "USE_CONSTANT_MEMORY": 0, + "USE_SOA": 0, + "VECTOR_SIZE": 1, + "WORK_GROUP_SIZE_X": 32, + "WORK_GROUP_SIZE_Y": 8, + "WORK_GROUP_SIZE_Z": 1, + "Z_ITERATIONS": 4 }, "correctness": 1, "invalidity": "correct", @@ -21186,61 +22520,61 @@ { "name": "time", "unit": "", - "value": 3391.776 + "value": 5766.336 }, { "name": "dram__throughput.avg.pct_of_peak_sustained_elapsed", "type": "Double", "unit": "", - "value": 30.925274907526536 + "value": 8.377835586869757 }, { "name": "dram__sectors_read.sum", "type": "Double", "unit": "", - "value": 2100468.0 + "value": 4868.0 }, { "name": "dram__sectors_write.sum", "type": "Double", "unit": "", - "value": 1837700.0 + "value": 1838288.0 }, { "name": "lts__t_sectors.avg.pct_of_peak_sustained_elapsed", "type": "Double", "unit": "", - "value": 3.6924934678009502 + "value": 1.0960094059460839 }, { "name": "lts__t_sectors_op_read.sum", "type": "Double", "unit": "", - "value": 2156064.0 + "value": 97312.0 }, { "name": "lts__t_sectors_op_write.sum", "type": "Double", "unit": "", - "value": 2099956.0 + "value": 2100424.0 }, { "name": "l1tex__t_requests_pipe_lsu_mem_global_op_ld.avg.pct_of_peak_sustained_active", "type": "Double", "unit": "", - "value": 1.9782882591877593 + "value": 3.6210682863864716 }, { "name": "l1tex__t_requests_pipe_lsu_mem_global_op_ld.sum", "type": "Double", "unit": "", - "value": 2621440.0 + "value": 8388608.0 }, { "name": "l1tex__data_pipe_lsu_wavefronts_mem_shared.avg.pct_of_peak_sustained_elapsed", "type": "Double", "unit": "", - "value": 0.07416164389712679 + "value": 0.16969768112289374 }, { "name": "l1tex__data_pipe_lsu_wavefronts_mem_shared_op_ld.sum", @@ -21270,13 +22604,13 @@ "name": "sm__warps_active.avg.pct_of_peak_sustained_active", "type": "Double", "unit": "", - "value": 98.66213612815854 + "value": 98.59914054366196 }, { "name": "smsp__cycles_active.avg.pct_of_peak_sustained_elapsed", "type": "Double", "unit": "", - "value": 99.96414170148827 + "value": 99.97134774266385 }, { "name": "smsp__sass_thread_inst_executed_op_fp16_pred_on.sum", @@ -21288,7 +22622,7 @@ "name": "smsp__sass_thread_inst_executed_op_fp32_pred_on.sum", "type": "Double", "unit": "", - "value": 4647288832.0 + "value": 5637144576.0 }, { "name": "smsp__sass_thread_inst_executed_op_fp64_pred_on.sum", @@ -21300,43 +22634,43 @@ "name": "smsp__sass_thread_inst_executed_op_integer_pred_on.sum", "type": "Double", "unit": "", - "value": 1824522240.0 + "value": 10049552384.0 }, { "name": "smsp__sass_thread_inst_executed_op_control_pred_on.sum", "type": "Double", "unit": "", - "value": 134217728.0 + "value": 1073741824.0 }, { "name": "smsp__sass_thread_inst_executed_op_memory_pred_on.sum", "type": "Double", "unit": "", - "value": 100663296.0 + "value": 285212672.0 }, { "name": "smsp__sass_thread_inst_executed_op_misc_pred_on.sum", "type": "Double", "unit": "", - "value": 1898971136.0 + "value": 1392508928.0 }, { "name": "smsp__sass_thread_inst_executed_op_conversion_pred_on.sum", "type": "Double", "unit": "", - "value": 3145728.0 + "value": 12582912.0 }, { "name": "smsp__inst_executed.sum", "type": "Double", "unit": "", - "value": 472612864.0 + "value": 786956288.0 }, { "name": "smsp__pipe_fma_cycles_active.avg.pct_of_peak_sustained_active", "type": "Double", "unit": "", - "value": 37.62596041333609 + "value": 33.79410762566248 }, { "name": "smsp__inst_executed_pipe_fp64.avg.pct_of_peak_sustained_active", @@ -21348,13 +22682,13 @@ "name": "smsp__inst_executed_pipe_xu.avg.pct_of_peak_sustained_active", "type": "Double", "unit": "", - "value": 50.64584297132833 + "value": 28.9700381481878 }, { "name": "smsp__inst_executed_pipe_lsu.avg.pct_of_peak_sustained_active", "type": "Double", "unit": "", - "value": 1.2241060679105236 + "value": 2.0086647544153653 }, { "name": "smsp__inst_executed_pipe_tex.avg.pct_of_peak_sustained_active", @@ -21366,7 +22700,7 @@ "name": "smsp__issue_active.avg.pct_of_peak_sustained_active", "type": "Double", "unit": "", - "value": 89.16874467376196 + "value": 84.93014609731566 }, { "name": "smsp__thread_inst_executed_per_inst_executed.ratio", @@ -21385,30 +22719,47 @@ "time" ], "times": { - "compilation": 15273.741, - "data": 60389.35, - "framework": 1827119.264, - "kernel_overhead": 844111.384, - "profiling_overhead": 50346.074, - "profiling_runs": 872272.456, + "compilation_time": 14094.86, + "data": 57985.872, + "framework": 2699925.7520000003, + "kernel_overhead": 1277035.939, + "profiling_overhead": 47752.929, + "profiling_runs": 1317151.012, "runtimes": [ - 3391.776 + 5766.336 ], - "search_algorithm": 34.912, - "validation": 13.158 + "search_algorithm": 35.035, + "validation": 15.817 }, - "timestamp": "2026-01-27 09:25:52 UTC" + "timestamp": "2026-03-13 09:40:33 UTC" }, { + "compilation_data": { + "constant_memory_size": 0, + "global_size": { + "x": 8, + "y": 32, + "z": 32 + }, + "local_memory_size": 0, + "local_size": { + "x": 32, + "y": 8, + "z": 1 + }, + "max_work_group_size": 1024, + "private_memory_size": 32, + "registers": 27 + }, "configuration": { - "INNER_UNROLL_FACTOR": "8", - "USE_CONSTANT_MEMORY": "0", - "USE_SOA": "0", - "VECTOR_SIZE": "1", - "WORK_GROUP_SIZE_X": "32", - "WORK_GROUP_SIZE_Y": "4", - "WORK_GROUP_SIZE_Z": "1", - "Z_ITERATIONS": "16" + "INNER_UNROLL_FACTOR": 1, + "USE_CONSTANT_MEMORY": 0, + "USE_SOA": 0, + "VECTOR_SIZE": 1, + "WORK_GROUP_SIZE_X": 32, + "WORK_GROUP_SIZE_Y": 8, + "WORK_GROUP_SIZE_Z": 1, + "Z_ITERATIONS": 8 }, "correctness": 1, "invalidity": "correct", @@ -21416,61 +22767,61 @@ { "name": "time", "unit": "", - "value": 3310.432 + "value": 8045.503 }, { "name": "dram__throughput.avg.pct_of_peak_sustained_elapsed", "type": "Double", "unit": "", - "value": 31.284769542135344 + "value": 6.12388964583231 }, { "name": "dram__sectors_read.sum", "type": "Double", "unit": "", - "value": 2098040.0 + "value": 6252.0 }, { "name": "dram__sectors_write.sum", "type": "Double", "unit": "", - "value": 1839376.0 + "value": 1909528.0 }, { "name": "lts__t_sectors.avg.pct_of_peak_sustained_elapsed", "type": "Double", "unit": "", - "value": 3.713781321740659 + "value": 49.135414583517814 }, { "name": "lts__t_sectors_op_read.sum", "type": "Double", "unit": "", - "value": 2155531.0 + "value": 708460.0 }, { "name": "lts__t_sectors_op_write.sum", "type": "Double", "unit": "", - "value": 2103083.0 + "value": 138415765.0 }, { "name": "l1tex__t_requests_pipe_lsu_mem_global_op_ld.avg.pct_of_peak_sustained_active", "type": "Double", "unit": "", - "value": 1.9782273495500273 + "value": 1.2786563572445209 }, { "name": "l1tex__t_requests_pipe_lsu_mem_global_op_ld.sum", "type": "Double", "unit": "", - "value": 2621440.0 + "value": 4194304.0 }, { "name": "l1tex__data_pipe_lsu_wavefronts_mem_shared.avg.pct_of_peak_sustained_elapsed", "type": "Double", "unit": "", - "value": 0.07416024745188818 + "value": 0.06002141633300066 }, { "name": "l1tex__data_pipe_lsu_wavefronts_mem_shared_op_ld.sum", @@ -21488,25 +22839,25 @@ "name": "l1tex__t_sectors_pipe_lsu_mem_local_op_ld.sum", "type": "Double", "unit": "", - "value": 0.0 + "value": 136314880.0 }, { "name": "l1tex__t_sectors_pipe_lsu_mem_local_op_st.sum", "type": "Double", "unit": "", - "value": 0.0 + "value": 136314880.0 }, { "name": "sm__warps_active.avg.pct_of_peak_sustained_active", "type": "Double", "unit": "", - "value": 98.61443842799194 + "value": 96.4582943423236 }, { "name": "smsp__cycles_active.avg.pct_of_peak_sustained_elapsed", "type": "Double", "unit": "", - "value": 99.96164909444593 + "value": 100.22835303966491 }, { "name": "smsp__sass_thread_inst_executed_op_fp16_pred_on.sum", @@ -21518,7 +22869,7 @@ "name": "smsp__sass_thread_inst_executed_op_fp32_pred_on.sum", "type": "Double", "unit": "", - "value": 4647288832.0 + "value": 4966055936.0 }, { "name": "smsp__sass_thread_inst_executed_op_fp64_pred_on.sum", @@ -21530,43 +22881,43 @@ "name": "smsp__sass_thread_inst_executed_op_integer_pred_on.sum", "type": "Double", "unit": "", - "value": 1824522240.0 + "value": 3982491648.0 }, { "name": "smsp__sass_thread_inst_executed_op_control_pred_on.sum", "type": "Double", "unit": "", - "value": 134217728.0 + "value": 1073741824.0 }, { "name": "smsp__sass_thread_inst_executed_op_memory_pred_on.sum", "type": "Double", "unit": "", - "value": 100663296.0 + "value": 2319450112.0 }, { "name": "smsp__sass_thread_inst_executed_op_misc_pred_on.sum", "type": "Double", "unit": "", - "value": 1898971136.0 + "value": 155189248.0 }, { "name": "smsp__sass_thread_inst_executed_op_conversion_pred_on.sum", "type": "Double", "unit": "", - "value": 3145728.0 + "value": 6291456.0 }, { "name": "smsp__inst_executed.sum", "type": "Double", "unit": "", - "value": 472612864.0 + "value": 395247616.0 }, { "name": "smsp__pipe_fma_cycles_active.avg.pct_of_peak_sustained_active", "type": "Double", "unit": "", - "value": 37.62609995035257 + "value": 10.689245919838248 }, { "name": "smsp__inst_executed_pipe_fp64.avg.pct_of_peak_sustained_active", @@ -21578,13 +22929,13 @@ "name": "smsp__inst_executed_pipe_xu.avg.pct_of_peak_sustained_active", "type": "Double", "unit": "", - "value": 50.64615218514697 + "value": 20.440633300861617 }, { "name": "smsp__inst_executed_pipe_lsu.avg.pct_of_peak_sustained_active", "type": "Double", "unit": "", - "value": 1.2241135415843625 + "value": 11.068682778640396 }, { "name": "smsp__inst_executed_pipe_tex.avg.pct_of_peak_sustained_active", @@ -21596,7 +22947,7 @@ "name": "smsp__issue_active.avg.pct_of_peak_sustained_active", "type": "Double", "unit": "", - "value": 89.16932380140665 + "value": 30.097364647198805 }, { "name": "smsp__thread_inst_executed_per_inst_executed.ratio", @@ -21615,30 +22966,47 @@ "time" ], "times": { - "compilation": 15418.354, - "data": 62444.449, - "framework": 1837252.312, - "kernel_overhead": 847358.958, - "profiling_overhead": 52063.188, - "profiling_runs": 875385.717, + "compilation_time": 16156.747, + "data": 59548.319, + "framework": 914479.264, + "kernel_overhead": 376537.66, + "profiling_overhead": 49060.053, + "profiling_runs": 429333.232, "runtimes": [ - 3310.432 + 8045.503 ], - "search_algorithm": 26.347, - "validation": 12.992 + "search_algorithm": 23.061, + "validation": 16.064 }, - "timestamp": "2026-01-27 09:25:53 UTC" + "timestamp": "2026-03-13 09:40:34 UTC" }, { + "compilation_data": { + "constant_memory_size": 0, + "global_size": { + "x": 8, + "y": 32, + "z": 16 + }, + "local_memory_size": 0, + "local_size": { + "x": 32, + "y": 8, + "z": 1 + }, + "max_work_group_size": 1024, + "private_memory_size": 64, + "registers": 32 + }, "configuration": { - "INNER_UNROLL_FACTOR": "8", - "USE_CONSTANT_MEMORY": "0", - "USE_SOA": "0", - "VECTOR_SIZE": "1", - "WORK_GROUP_SIZE_X": "32", - "WORK_GROUP_SIZE_Y": "8", - "WORK_GROUP_SIZE_Z": "1", - "Z_ITERATIONS": "16" + "INNER_UNROLL_FACTOR": 1, + "USE_CONSTANT_MEMORY": 0, + "USE_SOA": 0, + "VECTOR_SIZE": 1, + "WORK_GROUP_SIZE_X": 32, + "WORK_GROUP_SIZE_Y": 8, + "WORK_GROUP_SIZE_Z": 1, + "Z_ITERATIONS": 16 }, "correctness": 1, "invalidity": "correct", @@ -21646,61 +23014,61 @@ { "name": "time", "unit": "", - "value": 3388.544 + "value": 8259.104 }, { "name": "dram__throughput.avg.pct_of_peak_sustained_elapsed", "type": "Double", "unit": "", - "value": 31.14291326448118 + "value": 6.445551580227238 }, { "name": "dram__sectors_read.sum", "type": "Double", "unit": "", - "value": 2105564.0 + "value": 8664.0 }, { "name": "dram__sectors_write.sum", "type": "Double", "unit": "", - "value": 1839112.0 + "value": 1999840.0 }, { "name": "lts__t_sectors.avg.pct_of_peak_sustained_elapsed", "type": "Double", "unit": "", - "value": 3.715189142128396 + "value": 51.717233920607455 }, { "name": "lts__t_sectors_op_read.sum", "type": "Double", "unit": "", - "value": 2158951.0 + "value": 8481421.0 }, { "name": "lts__t_sectors_op_write.sum", "type": "Double", "unit": "", - "value": 2100527.0 + "value": 138419231.0 }, { "name": "l1tex__t_requests_pipe_lsu_mem_global_op_ld.avg.pct_of_peak_sustained_active", "type": "Double", "unit": "", - "value": 1.9889150066673613 + "value": 0.6435627625105395 }, { "name": "l1tex__t_requests_pipe_lsu_mem_global_op_ld.sum", "type": "Double", "unit": "", - "value": 2621440.0 + "value": 2097152.0 }, { "name": "l1tex__data_pipe_lsu_wavefronts_mem_shared.avg.pct_of_peak_sustained_elapsed", "type": "Double", "unit": "", - "value": 0.07456076319931206 + "value": 0.02995279217647109 }, { "name": "l1tex__data_pipe_lsu_wavefronts_mem_shared_op_ld.sum", @@ -21718,25 +23086,25 @@ "name": "l1tex__t_sectors_pipe_lsu_mem_local_op_ld.sum", "type": "Double", "unit": "", - "value": 0.0 + "value": 136314880.0 }, { "name": "l1tex__t_sectors_pipe_lsu_mem_local_op_st.sum", "type": "Double", "unit": "", - "value": 0.0 + "value": 136314880.0 }, { "name": "sm__warps_active.avg.pct_of_peak_sustained_active", "type": "Double", "unit": "", - "value": 98.37760385932948 + "value": 85.2278319074976 }, { "name": "smsp__cycles_active.avg.pct_of_peak_sustained_elapsed", "type": "Double", "unit": "", - "value": 99.95829185253764 + "value": 99.08454204233192 }, { "name": "smsp__sass_thread_inst_executed_op_fp16_pred_on.sum", @@ -21748,7 +23116,7 @@ "name": "smsp__sass_thread_inst_executed_op_fp32_pred_on.sum", "type": "Double", "unit": "", - "value": 4647288832.0 + "value": 4630511616.0 }, { "name": "smsp__sass_thread_inst_executed_op_fp64_pred_on.sum", @@ -21760,25 +23128,25 @@ "name": "smsp__sass_thread_inst_executed_op_integer_pred_on.sum", "type": "Double", "unit": "", - "value": 1824522240.0 + "value": 3635412992.0 }, { "name": "smsp__sass_thread_inst_executed_op_control_pred_on.sum", "type": "Double", "unit": "", - "value": 134217728.0 + "value": 1073741824.0 }, { "name": "smsp__sass_thread_inst_executed_op_memory_pred_on.sum", "type": "Double", "unit": "", - "value": 100663296.0 + "value": 2252341248.0 }, { "name": "smsp__sass_thread_inst_executed_op_misc_pred_on.sum", "type": "Double", "unit": "", - "value": 1898971136.0 + "value": 77594624.0 }, { "name": "smsp__sass_thread_inst_executed_op_conversion_pred_on.sum", @@ -21790,13 +23158,13 @@ "name": "smsp__inst_executed.sum", "type": "Double", "unit": "", - "value": 472612864.0 + "value": 367034368.0 }, { "name": "smsp__pipe_fma_cycles_active.avg.pct_of_peak_sustained_active", "type": "Double", "unit": "", - "value": 37.83017846478061 + "value": 9.326136681167359 }, { "name": "smsp__inst_executed_pipe_fp64.avg.pct_of_peak_sustained_active", @@ -21808,13 +23176,13 @@ "name": "smsp__inst_executed_pipe_xu.avg.pct_of_peak_sustained_active", "type": "Double", "unit": "", - "value": 50.92138604417822 + "value": 20.636693041115358 }, { "name": "smsp__inst_executed_pipe_lsu.avg.pct_of_peak_sustained_active", "type": "Double", "unit": "", - "value": 1.2307659224545027 + "value": 10.837286799667755 }, { "name": "smsp__inst_executed_pipe_tex.avg.pct_of_peak_sustained_active", @@ -21826,7 +23194,7 @@ "name": "smsp__issue_active.avg.pct_of_peak_sustained_active", "type": "Double", "unit": "", - "value": 89.65389274436443 + "value": 28.217085541455482 }, { "name": "smsp__thread_inst_executed_per_inst_executed.ratio", @@ -21845,260 +23213,47 @@ "time" ], "times": { - "compilation": 15701.298, - "data": 61357.454, - "framework": 1833858.245, - "kernel_overhead": 846653.398, - "profiling_overhead": 51111.98, - "profiling_runs": 874735.413, + "compilation_time": 14791.697, + "data": 59562.649, + "framework": 880938.26, + "kernel_overhead": 359564.884, + "profiling_overhead": 49416.463, + "profiling_runs": 412394.264, "runtimes": [ - 3388.544 + 8259.104 ], - "search_algorithm": 38.549, - "validation": 17.811 + "search_algorithm": 24.176, + "validation": 16.915 }, - "timestamp": "2026-01-27 09:25:54 UTC" + "timestamp": "2026-03-13 09:40:34 UTC" }, { - "configuration": { - "INNER_UNROLL_FACTOR": "8", - "USE_CONSTANT_MEMORY": "0", - "USE_SOA": "0", - "VECTOR_SIZE": "1", - "WORK_GROUP_SIZE_X": "16", - "WORK_GROUP_SIZE_Y": "4", - "WORK_GROUP_SIZE_Z": "1", - "Z_ITERATIONS": "32" + "compilation_data": { + "constant_memory_size": 0, + "global_size": { + "x": 8, + "y": 32, + "z": 8 + }, + "local_memory_size": 0, + "local_size": { + "x": 32, + "y": 8, + "z": 1 + }, + "max_work_group_size": 1024, + "private_memory_size": 128, + "registers": 32 }, - "correctness": 1, - "invalidity": "correct", - "measurements": [ - { - "name": "time", - "unit": "", - "value": 6701.28 - }, - { - "name": "dram__throughput.avg.pct_of_peak_sustained_elapsed", - "type": "Double", - "unit": "", - "value": 15.5538044403497 - }, - { - "name": "dram__sectors_read.sum", - "type": "Double", - "unit": "", - "value": 2107732.0 - }, - { - "name": "dram__sectors_write.sum", - "type": "Double", - "unit": "", - "value": 1868348.0 - }, - { - "name": "lts__t_sectors.avg.pct_of_peak_sustained_elapsed", - "type": "Double", - "unit": "", - "value": 1.8498570383041355 - }, - { - "name": "lts__t_sectors_op_read.sum", - "type": "Double", - "unit": "", - "value": 2208852.0 - }, - { - "name": "lts__t_sectors_op_write.sum", - "type": "Double", - "unit": "", - "value": 2101491.0 - }, - { - "name": "l1tex__t_requests_pipe_lsu_mem_global_op_ld.avg.pct_of_peak_sustained_active", - "type": "Double", - "unit": "", - "value": 0.5843098677777812 - }, - { - "name": "l1tex__t_requests_pipe_lsu_mem_global_op_ld.sum", - "type": "Double", - "unit": "", - "value": 1572864.0 - }, - { - "name": "l1tex__data_pipe_lsu_wavefronts_mem_shared.avg.pct_of_peak_sustained_elapsed", - "type": "Double", - "unit": "", - "value": 0.01825583990769391 - }, - { - "name": "l1tex__data_pipe_lsu_wavefronts_mem_shared_op_ld.sum", - "type": "Double", - "unit": "", - "value": 0.0 - }, - { - "name": "l1tex__data_pipe_lsu_wavefronts_mem_shared_op_st.sum", - "type": "Double", - "unit": "", - "value": 0.0 - }, - { - "name": "l1tex__t_sectors_pipe_lsu_mem_local_op_ld.sum", - "type": "Double", - "unit": "", - "value": 0.0 - }, - { - "name": "l1tex__t_sectors_pipe_lsu_mem_local_op_st.sum", - "type": "Double", - "unit": "", - "value": 0.0 - }, - { - "name": "sm__warps_active.avg.pct_of_peak_sustained_active", - "type": "Double", - "unit": "", - "value": 81.94893309030182 - }, - { - "name": "smsp__cycles_active.avg.pct_of_peak_sustained_elapsed", - "type": "Double", - "unit": "", - "value": 99.97703934059217 - }, - { - "name": "smsp__sass_thread_inst_executed_op_fp16_pred_on.sum", - "type": "Double", - "unit": "", - "value": 0.0 - }, - { - "name": "smsp__sass_thread_inst_executed_op_fp32_pred_on.sum", - "type": "Double", - "unit": "", - "value": 4479516672.0 - }, - { - "name": "smsp__sass_thread_inst_executed_op_fp64_pred_on.sum", - "type": "Double", - "unit": "", - "value": 0.0 - }, - { - "name": "smsp__sass_thread_inst_executed_op_integer_pred_on.sum", - "type": "Double", - "unit": "", - "value": 6415712256.0 - }, - { - "name": "smsp__sass_thread_inst_executed_op_control_pred_on.sum", - "type": "Double", - "unit": "", - "value": 134217728.0 - }, - { - "name": "smsp__sass_thread_inst_executed_op_memory_pred_on.sum", - "type": "Double", - "unit": "", - "value": 67108864.0 - }, - { - "name": "smsp__sass_thread_inst_executed_op_misc_pred_on.sum", - "type": "Double", - "unit": "", - "value": 1155006464.0 - }, - { - "name": "smsp__sass_thread_inst_executed_op_conversion_pred_on.sum", - "type": "Double", - "unit": "", - "value": 1572864.0 - }, - { - "name": "smsp__inst_executed.sum", - "type": "Double", - "unit": "", - "value": 937738240.0 - }, - { - "name": "smsp__pipe_fma_cycles_active.avg.pct_of_peak_sustained_active", - "type": "Double", - "unit": "", - "value": 35.073083052459225 - }, - { - "name": "smsp__inst_executed_pipe_fp64.avg.pct_of_peak_sustained_active", - "type": "Double", - "unit": "", - "value": 0.0 - }, - { - "name": "smsp__inst_executed_pipe_xu.avg.pct_of_peak_sustained_active", - "type": "Double", - "unit": "", - "value": 24.93103108310527 - }, - { - "name": "smsp__inst_executed_pipe_lsu.avg.pct_of_peak_sustained_active", - "type": "Double", - "unit": "", - "value": 0.39867737693930544 - }, - { - "name": "smsp__inst_executed_pipe_tex.avg.pct_of_peak_sustained_active", - "type": "Double", - "unit": "", - "value": 0.0 - }, - { - "name": "smsp__issue_active.avg.pct_of_peak_sustained_active", - "type": "Double", - "unit": "", - "value": 87.09316291017713 - }, - { - "name": "smsp__thread_inst_executed_per_inst_executed.ratio", - "type": "Double", - "unit": "", - "value": 32.0 - }, - { - "name": "smsp__thread_inst_executed_per_inst_executed.pct", - "type": "Double", - "unit": "", - "value": 100.0 - } - ], - "objectives": [ - "time" - ], - "times": { - "compilation": 14840.152, - "data": 60953.569, - "framework": 3524944.984, - "kernel_overhead": 1684483.366, - "profiling_overhead": 50478.425, - "profiling_runs": 1729029.624, - "runtimes": [ - 6701.28 - ], - "search_algorithm": 24.908, - "validation": 15.871 - }, - "timestamp": "2026-01-27 09:25:55 UTC" - }, - { "configuration": { - "INNER_UNROLL_FACTOR": "8", - "USE_CONSTANT_MEMORY": "0", - "USE_SOA": "0", - "VECTOR_SIZE": "1", - "WORK_GROUP_SIZE_X": "16", - "WORK_GROUP_SIZE_Y": "8", - "WORK_GROUP_SIZE_Z": "1", - "Z_ITERATIONS": "32" + "INNER_UNROLL_FACTOR": 1, + "USE_CONSTANT_MEMORY": 0, + "USE_SOA": 0, + "VECTOR_SIZE": 1, + "WORK_GROUP_SIZE_X": 32, + "WORK_GROUP_SIZE_Y": 8, + "WORK_GROUP_SIZE_Z": 1, + "Z_ITERATIONS": 32 }, "correctness": 1, "invalidity": "correct", @@ -22106,61 +23261,61 @@ { "name": "time", "unit": "", - "value": 6760.384 + "value": 8780.448 }, { "name": "dram__throughput.avg.pct_of_peak_sustained_elapsed", "type": "Double", "unit": "", - "value": 15.579809995289967 + "value": 6.545309701688259 }, { "name": "dram__sectors_read.sum", "type": "Double", "unit": "", - "value": 2107272.0 + "value": 33484.0 }, { "name": "dram__sectors_write.sum", "type": "Double", "unit": "", - "value": 1872664.0 + "value": 2202620.0 }, { "name": "lts__t_sectors.avg.pct_of_peak_sustained_elapsed", "type": "Double", "unit": "", - "value": 1.8634567454569144 + "value": 61.33171293707412 }, { "name": "lts__t_sectors_op_read.sum", "type": "Double", "unit": "", - "value": 2211169.0 + "value": 47415751.0 }, { "name": "lts__t_sectors_op_write.sum", "type": "Double", "unit": "", - "value": 2108796.0 + "value": 138422062.0 }, { "name": "l1tex__t_requests_pipe_lsu_mem_global_op_ld.avg.pct_of_peak_sustained_active", "type": "Double", "unit": "", - "value": 0.5842987931919865 + "value": 0.3000275184121926 }, { "name": "l1tex__t_requests_pipe_lsu_mem_global_op_ld.sum", "type": "Double", "unit": "", - "value": 1572864.0 + "value": 1048576.0 }, { "name": "l1tex__data_pipe_lsu_wavefronts_mem_shared.avg.pct_of_peak_sustained_elapsed", "type": "Double", "unit": "", - "value": 0.018254982348911722 + "value": 0.013987374527684221 }, { "name": "l1tex__data_pipe_lsu_wavefronts_mem_shared_op_ld.sum", @@ -22178,25 +23333,25 @@ "name": "l1tex__t_sectors_pipe_lsu_mem_local_op_ld.sum", "type": "Double", "unit": "", - "value": 0.0 + "value": 136314880.0 }, { "name": "l1tex__t_sectors_pipe_lsu_mem_local_op_st.sum", "type": "Double", "unit": "", - "value": 0.0 + "value": 136314880.0 }, { "name": "sm__warps_active.avg.pct_of_peak_sustained_active", "type": "Double", "unit": "", - "value": 81.91233585656829 + "value": 76.12008888082325 }, { "name": "smsp__cycles_active.avg.pct_of_peak_sustained_elapsed", "type": "Double", "unit": "", - "value": 99.97758925338934 + "value": 98.81929734979549 }, { "name": "smsp__sass_thread_inst_executed_op_fp16_pred_on.sum", @@ -22208,7 +23363,7 @@ "name": "smsp__sass_thread_inst_executed_op_fp32_pred_on.sum", "type": "Double", "unit": "", - "value": 4479516672.0 + "value": 4462739456.0 }, { "name": "smsp__sass_thread_inst_executed_op_fp64_pred_on.sum", @@ -22220,25 +23375,25 @@ "name": "smsp__sass_thread_inst_executed_op_integer_pred_on.sum", "type": "Double", "unit": "", - "value": 6415712256.0 + "value": 3461873664.0 }, { "name": "smsp__sass_thread_inst_executed_op_control_pred_on.sum", "type": "Double", "unit": "", - "value": 134217728.0 + "value": 1073741824.0 }, { "name": "smsp__sass_thread_inst_executed_op_memory_pred_on.sum", "type": "Double", "unit": "", - "value": 67108864.0 + "value": 2218786816.0 }, { "name": "smsp__sass_thread_inst_executed_op_misc_pred_on.sum", "type": "Double", "unit": "", - "value": 1155006464.0 + "value": 38797312.0 }, { "name": "smsp__sass_thread_inst_executed_op_conversion_pred_on.sum", @@ -22250,13 +23405,13 @@ "name": "smsp__inst_executed.sum", "type": "Double", "unit": "", - "value": 937738240.0 + "value": 352927744.0 }, { "name": "smsp__pipe_fma_cycles_active.avg.pct_of_peak_sustained_active", "type": "Double", "unit": "", - "value": 35.071285141369735 + "value": 8.045589900817038 }, { "name": "smsp__inst_executed_pipe_fp64.avg.pct_of_peak_sustained_active", @@ -22268,13 +23423,13 @@ "name": "smsp__inst_executed_pipe_xu.avg.pct_of_peak_sustained_active", "type": "Double", "unit": "", - "value": 24.92972283739663 + "value": 19.325606638210306 }, { "name": "smsp__inst_executed_pipe_lsu.avg.pct_of_peak_sustained_active", "type": "Double", "unit": "", - "value": 0.3986564565062205 + "value": 9.99071583408455 }, { "name": "smsp__inst_executed_pipe_tex.avg.pct_of_peak_sustained_active", @@ -22286,7 +23441,7 @@ "name": "smsp__issue_active.avg.pct_of_peak_sustained_active", "type": "Double", "unit": "", - "value": 87.08859273190212 + "value": 25.408811173261398 }, { "name": "smsp__thread_inst_executed_per_inst_executed.ratio", @@ -22305,30 +23460,47 @@ "time" ], "times": { - "compilation": 15309.865, - "data": 60301.286, - "framework": 3525717.507, - "kernel_overhead": 1685225.964, - "profiling_overhead": 50331.175, - "profiling_runs": 1729859.082, + "compilation_time": 14726.061, + "data": 59294.812, + "framework": 865387.493, + "kernel_overhead": 350523.9, + "profiling_overhead": 48945.84, + "profiling_runs": 406622.941, "runtimes": [ - 6760.384 + 8780.448 ], - "search_algorithm": 25.091, - "validation": 15.576 + "search_algorithm": 22.094, + "validation": 17.816 }, - "timestamp": "2026-01-27 09:25:57 UTC" + "timestamp": "2026-03-13 09:40:35 UTC" }, { + "compilation_data": { + "constant_memory_size": 0, + "global_size": { + "x": 8, + "y": 32, + "z": 64 + }, + "local_memory_size": 0, + "local_size": { + "x": 32, + "y": 8, + "z": 1 + }, + "max_work_group_size": 1024, + "private_memory_size": 0, + "registers": 23 + }, "configuration": { - "INNER_UNROLL_FACTOR": "8", - "USE_CONSTANT_MEMORY": "0", - "USE_SOA": "0", - "VECTOR_SIZE": "1", - "WORK_GROUP_SIZE_X": "32", - "WORK_GROUP_SIZE_Y": "2", - "WORK_GROUP_SIZE_Z": "1", - "Z_ITERATIONS": "32" + "INNER_UNROLL_FACTOR": 2, + "USE_CONSTANT_MEMORY": 0, + "USE_SOA": 0, + "VECTOR_SIZE": 1, + "WORK_GROUP_SIZE_X": 32, + "WORK_GROUP_SIZE_Y": 8, + "WORK_GROUP_SIZE_Z": 1, + "Z_ITERATIONS": 4 }, "correctness": 1, "invalidity": "correct", @@ -22336,61 +23508,61 @@ { "name": "time", "unit": "", - "value": 6660.224 + "value": 3473.44 }, { "name": "dram__throughput.avg.pct_of_peak_sustained_elapsed", "type": "Double", "unit": "", - "value": 15.508509475402239 + "value": 14.020032659696668 }, { "name": "dram__sectors_read.sum", "type": "Double", "unit": "", - "value": 2102608.0 + "value": 5944.0 }, { "name": "dram__sectors_write.sum", "type": "Double", "unit": "", - "value": 1838072.0 + "value": 1837540.0 }, { "name": "lts__t_sectors.avg.pct_of_peak_sustained_elapsed", "type": "Double", "unit": "", - "value": 1.8614686532081357 + "value": 1.8047616145495888 }, { "name": "lts__t_sectors_op_read.sum", "type": "Double", "unit": "", - "value": 2210993.0 + "value": 61875.0 }, { "name": "lts__t_sectors_op_write.sum", "type": "Double", "unit": "", - "value": 2100370.0 + "value": 2100790.0 }, { "name": "l1tex__t_requests_pipe_lsu_mem_global_op_ld.avg.pct_of_peak_sustained_active", "type": "Double", "unit": "", - "value": 0.5843982581033305 + "value": 6.059821416974538 }, { "name": "l1tex__t_requests_pipe_lsu_mem_global_op_ld.sum", "type": "Double", "unit": "", - "value": 1572864.0 + "value": 8388608.0 }, { "name": "l1tex__data_pipe_lsu_wavefronts_mem_shared.avg.pct_of_peak_sustained_elapsed", "type": "Double", "unit": "", - "value": 0.018259440798087348 + "value": 0.2839350045785906 }, { "name": "l1tex__data_pipe_lsu_wavefronts_mem_shared_op_ld.sum", @@ -22420,13 +23592,13 @@ "name": "sm__warps_active.avg.pct_of_peak_sustained_active", "type": "Double", "unit": "", - "value": 81.9615626153873 + "value": 97.61281333963794 }, { "name": "smsp__cycles_active.avg.pct_of_peak_sustained_elapsed", "type": "Double", "unit": "", - "value": 99.9761945029573 + "value": 99.94483386116715 }, { "name": "smsp__sass_thread_inst_executed_op_fp16_pred_on.sum", @@ -22438,7 +23610,7 @@ "name": "smsp__sass_thread_inst_executed_op_fp32_pred_on.sum", "type": "Double", "unit": "", - "value": 4479516672.0 + "value": 5637144576.0 }, { "name": "smsp__sass_thread_inst_executed_op_fp64_pred_on.sum", @@ -22450,43 +23622,43 @@ "name": "smsp__sass_thread_inst_executed_op_integer_pred_on.sum", "type": "Double", "unit": "", - "value": 6415712256.0 + "value": 4945084416.0 }, { "name": "smsp__sass_thread_inst_executed_op_control_pred_on.sum", "type": "Double", "unit": "", - "value": 134217728.0 + "value": 536870912.0 }, { "name": "smsp__sass_thread_inst_executed_op_memory_pred_on.sum", "type": "Double", "unit": "", - "value": 67108864.0 + "value": 285212672.0 }, { "name": "smsp__sass_thread_inst_executed_op_misc_pred_on.sum", "type": "Double", "unit": "", - "value": 1155006464.0 + "value": 1665138688.0 }, { "name": "smsp__sass_thread_inst_executed_op_conversion_pred_on.sum", "type": "Double", "unit": "", - "value": 1572864.0 + "value": 12582912.0 }, { "name": "smsp__inst_executed.sum", "type": "Double", "unit": "", - "value": 937738240.0 + "value": 484966400.0 }, { "name": "smsp__pipe_fma_cycles_active.avg.pct_of_peak_sustained_active", "type": "Double", "unit": "", - "value": 35.08005468944198 + "value": 41.38294054596213 }, { "name": "smsp__inst_executed_pipe_fp64.avg.pct_of_peak_sustained_active", @@ -22498,13 +23670,13 @@ "name": "smsp__inst_executed_pipe_xu.avg.pct_of_peak_sustained_active", "type": "Double", "unit": "", - "value": 24.93615934633101 + "value": 48.48498807724858 }, { "name": "smsp__inst_executed_pipe_lsu.avg.pct_of_peak_sustained_active", "type": "Double", "unit": "", - "value": 0.3987593840782913 + "value": 3.3617521030123525 }, { "name": "smsp__inst_executed_pipe_tex.avg.pct_of_peak_sustained_active", @@ -22516,7 +23688,7 @@ "name": "smsp__issue_active.avg.pct_of_peak_sustained_active", "type": "Double", "unit": "", - "value": 87.11107779958049 + "value": 87.59562757418074 }, { "name": "smsp__thread_inst_executed_per_inst_executed.ratio", @@ -22535,30 +23707,47 @@ "time" ], "times": { - "compilation": 14621.572, - "data": 59924.193, - "framework": 3527953.767, - "kernel_overhead": 1686928.427, - "profiling_overhead": 49845.177, - "profiling_runs": 1731255.97, + "compilation_time": 14586.296, + "data": 57243.152, + "framework": 1360315.105, + "kernel_overhead": 613556.265, + "profiling_overhead": 47030.135, + "profiling_runs": 642485.553, "runtimes": [ - 6660.224 + 3473.44 ], - "search_algorithm": 25.681, - "validation": 13.485 + "search_algorithm": 23.974, + "validation": 16.252 }, - "timestamp": "2026-01-27 09:25:59 UTC" + "timestamp": "2026-03-13 09:40:35 UTC" }, { + "compilation_data": { + "constant_memory_size": 0, + "global_size": { + "x": 8, + "y": 32, + "z": 32 + }, + "local_memory_size": 0, + "local_size": { + "x": 32, + "y": 8, + "z": 1 + }, + "max_work_group_size": 1024, + "private_memory_size": 0, + "registers": 27 + }, "configuration": { - "INNER_UNROLL_FACTOR": "8", - "USE_CONSTANT_MEMORY": "0", - "USE_SOA": "0", - "VECTOR_SIZE": "1", - "WORK_GROUP_SIZE_X": "32", - "WORK_GROUP_SIZE_Y": "4", - "WORK_GROUP_SIZE_Z": "1", - "Z_ITERATIONS": "32" + "INNER_UNROLL_FACTOR": 2, + "USE_CONSTANT_MEMORY": 0, + "USE_SOA": 0, + "VECTOR_SIZE": 1, + "WORK_GROUP_SIZE_X": 32, + "WORK_GROUP_SIZE_Y": 8, + "WORK_GROUP_SIZE_Z": 1, + "Z_ITERATIONS": 8 }, "correctness": 1, "invalidity": "correct", @@ -22566,61 +23755,61 @@ { "name": "time", "unit": "", - "value": 6666.88 + "value": 4127.711 }, { "name": "dram__throughput.avg.pct_of_peak_sustained_elapsed", "type": "Double", "unit": "", - "value": 15.508967116399733 + "value": 11.698888142903646 }, { "name": "dram__sectors_read.sum", "type": "Double", "unit": "", - "value": 2118100.0 + "value": 924.0 }, { "name": "dram__sectors_write.sum", "type": "Double", "unit": "", - "value": 1845724.0 + "value": 1839152.0 }, { "name": "lts__t_sectors.avg.pct_of_peak_sustained_elapsed", "type": "Double", "unit": "", - "value": 1.8607607576535068 + "value": 1.5159965348459281 }, { "name": "lts__t_sectors_op_read.sum", "type": "Double", "unit": "", - "value": 2229109.0 + "value": 69672.0 }, { "name": "lts__t_sectors_op_write.sum", "type": "Double", "unit": "", - "value": 2105787.0 + "value": 2102900.0 }, { "name": "l1tex__t_requests_pipe_lsu_mem_global_op_ld.avg.pct_of_peak_sustained_active", "type": "Double", "unit": "", - "value": 0.5844020796763401 + "value": 2.5329366295646727 }, { "name": "l1tex__t_requests_pipe_lsu_mem_global_op_ld.sum", "type": "Double", "unit": "", - "value": 1572864.0 + "value": 4194304.0 }, { "name": "l1tex__data_pipe_lsu_wavefronts_mem_shared.avg.pct_of_peak_sustained_elapsed", "type": "Double", "unit": "", - "value": 0.018259502118298392 + "value": 0.11868474928498668 }, { "name": "l1tex__data_pipe_lsu_wavefronts_mem_shared_op_ld.sum", @@ -22650,13 +23839,13 @@ "name": "sm__warps_active.avg.pct_of_peak_sustained_active", "type": "Double", "unit": "", - "value": 81.93438412246586 + "value": 98.05694193438794 }, { "name": "smsp__cycles_active.avg.pct_of_peak_sustained_elapsed", "type": "Double", "unit": "", - "value": 99.97675927435003 + "value": 99.942124372741 }, { "name": "smsp__sass_thread_inst_executed_op_fp16_pred_on.sum", @@ -22668,7 +23857,7 @@ "name": "smsp__sass_thread_inst_executed_op_fp32_pred_on.sum", "type": "Double", "unit": "", - "value": 4479516672.0 + "value": 4966055936.0 }, { "name": "smsp__sass_thread_inst_executed_op_fp64_pred_on.sum", @@ -22680,43 +23869,43 @@ "name": "smsp__sass_thread_inst_executed_op_integer_pred_on.sum", "type": "Double", "unit": "", - "value": 6415712256.0 + "value": 5463080960.0 }, { "name": "smsp__sass_thread_inst_executed_op_control_pred_on.sum", "type": "Double", "unit": "", - "value": 134217728.0 + "value": 536870912.0 }, { "name": "smsp__sass_thread_inst_executed_op_memory_pred_on.sum", "type": "Double", "unit": "", - "value": 67108864.0 + "value": 150994944.0 }, { "name": "smsp__sass_thread_inst_executed_op_misc_pred_on.sum", "type": "Double", "unit": "", - "value": 1155006464.0 + "value": 1369440256.0 }, { "name": "smsp__sass_thread_inst_executed_op_conversion_pred_on.sum", "type": "Double", "unit": "", - "value": 1572864.0 + "value": 6291456.0 }, { "name": "smsp__inst_executed.sum", "type": "Double", "unit": "", - "value": 937738240.0 + "value": 596246528.0 }, { "name": "smsp__pipe_fma_cycles_active.avg.pct_of_peak_sustained_active", "type": "Double", "unit": "", - "value": 35.07999848155109 + "value": 40.85957669486962 }, { "name": "smsp__inst_executed_pipe_fp64.avg.pct_of_peak_sustained_active", @@ -22728,13 +23917,13 @@ "name": "smsp__inst_executed_pipe_xu.avg.pct_of_peak_sustained_active", "type": "Double", "unit": "", - "value": 24.936102223288916 + "value": 40.534520697385496 }, { "name": "smsp__inst_executed_pipe_lsu.avg.pct_of_peak_sustained_active", "type": "Double", "unit": "", - "value": 0.3987584706116758 + "value": 1.4844184825702695 }, { "name": "smsp__inst_executed_pipe_tex.avg.pct_of_peak_sustained_active", @@ -22746,7 +23935,7 @@ "name": "smsp__issue_active.avg.pct_of_peak_sustained_active", "type": "Double", "unit": "", - "value": 87.11087824800981 + "value": 90.03547867989383 }, { "name": "smsp__thread_inst_executed_per_inst_executed.ratio", @@ -22765,30 +23954,47 @@ "time" ], "times": { - "compilation": 15145.04, - "data": 60225.47, - "framework": 3534012.551, - "kernel_overhead": 1689451.138, - "profiling_overhead": 50109.573, - "profiling_runs": 1734226.37, + "compilation_time": 14585.207, + "data": 57362.995, + "framework": 1951134.7340000002, + "kernel_overhead": 907221.305, + "profiling_overhead": 47301.246, + "profiling_runs": 939249.188, "runtimes": [ - 6666.88 + 4127.711 ], - "search_algorithm": 27.845, - "validation": 14.899 + "search_algorithm": 24.154, + "validation": 17.024 }, - "timestamp": "2026-01-27 09:26:1 UTC" + "timestamp": "2026-03-13 09:40:36 UTC" }, { + "compilation_data": { + "constant_memory_size": 0, + "global_size": { + "x": 8, + "y": 32, + "z": 16 + }, + "local_memory_size": 0, + "local_size": { + "x": 32, + "y": 8, + "z": 1 + }, + "max_work_group_size": 1024, + "private_memory_size": 64, + "registers": 32 + }, "configuration": { - "INNER_UNROLL_FACTOR": "8", - "USE_CONSTANT_MEMORY": "0", - "USE_SOA": "0", - "VECTOR_SIZE": "1", - "WORK_GROUP_SIZE_X": "32", - "WORK_GROUP_SIZE_Y": "8", - "WORK_GROUP_SIZE_Z": "1", - "Z_ITERATIONS": "32" + "INNER_UNROLL_FACTOR": 2, + "USE_CONSTANT_MEMORY": 0, + "USE_SOA": 0, + "VECTOR_SIZE": 1, + "WORK_GROUP_SIZE_X": 32, + "WORK_GROUP_SIZE_Y": 8, + "WORK_GROUP_SIZE_Z": 1, + "Z_ITERATIONS": 16 }, "correctness": 1, "invalidity": "correct", @@ -22796,61 +24002,61 @@ { "name": "time", "unit": "", - "value": 6854.56 + "value": 8125.76 }, { "name": "dram__throughput.avg.pct_of_peak_sustained_elapsed", "type": "Double", "unit": "", - "value": 15.6044681785152 + "value": 6.382699748579639 }, { "name": "dram__sectors_read.sum", "type": "Double", "unit": "", - "value": 2115084.0 + "value": 4548.0 }, { "name": "dram__sectors_write.sum", "type": "Double", "unit": "", - "value": 1843068.0 + "value": 1997132.0 }, { "name": "lts__t_sectors.avg.pct_of_peak_sustained_elapsed", "type": "Double", "unit": "", - "value": 1.874668476125645 + "value": 53.310037761541466 }, { "name": "lts__t_sectors_op_read.sum", "type": "Double", "unit": "", - "value": 2224703.0 + "value": 12879676.0 }, { "name": "lts__t_sectors_op_write.sum", "type": "Double", "unit": "", - "value": 2109479.0 + "value": 138419285.0 }, { "name": "l1tex__t_requests_pipe_lsu_mem_global_op_ld.avg.pct_of_peak_sustained_active", "type": "Double", "unit": "", - "value": 0.5849343415418866 + "value": 0.6405844590323392 }, { "name": "l1tex__t_requests_pipe_lsu_mem_global_op_ld.sum", "type": "Double", "unit": "", - "value": 1572864.0 + "value": 2097152.0 }, { "name": "l1tex__data_pipe_lsu_wavefronts_mem_shared.avg.pct_of_peak_sustained_elapsed", "type": "Double", "unit": "", - "value": 0.018275179949843196 + "value": 0.029936737446699852 }, { "name": "l1tex__data_pipe_lsu_wavefronts_mem_shared_op_ld.sum", @@ -22868,25 +24074,25 @@ "name": "l1tex__t_sectors_pipe_lsu_mem_local_op_ld.sum", "type": "Double", "unit": "", - "value": 0.0 + "value": 136314880.0 }, { "name": "l1tex__t_sectors_pipe_lsu_mem_local_op_st.sum", "type": "Double", "unit": "", - "value": 0.0 + "value": 136314880.0 }, { "name": "sm__warps_active.avg.pct_of_peak_sustained_active", "type": "Double", "unit": "", - "value": 81.92349997665701 + "value": 86.71388132456342 }, { "name": "smsp__cycles_active.avg.pct_of_peak_sustained_elapsed", "type": "Double", "unit": "", - "value": 99.97894944812764 + "value": 99.6202526347094 }, { "name": "smsp__sass_thread_inst_executed_op_fp16_pred_on.sum", @@ -22898,7 +24104,7 @@ "name": "smsp__sass_thread_inst_executed_op_fp32_pred_on.sum", "type": "Double", "unit": "", - "value": 4479516672.0 + "value": 4630511616.0 }, { "name": "smsp__sass_thread_inst_executed_op_fp64_pred_on.sum", @@ -22910,43 +24116,43 @@ "name": "smsp__sass_thread_inst_executed_op_integer_pred_on.sum", "type": "Double", "unit": "", - "value": 6415712256.0 + "value": 1958739968.0 }, { "name": "smsp__sass_thread_inst_executed_op_control_pred_on.sum", "type": "Double", "unit": "", - "value": 134217728.0 + "value": 536870912.0 }, { "name": "smsp__sass_thread_inst_executed_op_memory_pred_on.sum", "type": "Double", "unit": "", - "value": 67108864.0 + "value": 1178599424.0 }, { "name": "smsp__sass_thread_inst_executed_op_misc_pred_on.sum", "type": "Double", "unit": "", - "value": 1155006464.0 + "value": 143654912.0 }, { "name": "smsp__sass_thread_inst_executed_op_conversion_pred_on.sum", "type": "Double", "unit": "", - "value": 1572864.0 + "value": 3145728.0 }, { "name": "smsp__inst_executed.sum", "type": "Double", "unit": "", - "value": 937738240.0 + "value": 266371072.0 }, { "name": "smsp__pipe_fma_cycles_active.avg.pct_of_peak_sustained_active", "type": "Double", "unit": "", - "value": 35.10944568816628 + "value": 9.11038583367539 }, { "name": "smsp__inst_executed_pipe_fp64.avg.pct_of_peak_sustained_active", @@ -22958,13 +24164,13 @@ "name": "smsp__inst_executed_pipe_xu.avg.pct_of_peak_sustained_active", "type": "Double", "unit": "", - "value": 24.956965937246302 + "value": 20.514716860387914 }, { "name": "smsp__inst_executed_pipe_lsu.avg.pct_of_peak_sustained_active", "type": "Double", "unit": "", - "value": 0.39909210666250794 + "value": 5.644552222084273 }, { "name": "smsp__inst_executed_pipe_tex.avg.pct_of_peak_sustained_active", @@ -22976,7 +24182,7 @@ "name": "smsp__issue_active.avg.pct_of_peak_sustained_active", "type": "Double", "unit": "", - "value": 87.1837754364612 + "value": 20.35727834005307 }, { "name": "smsp__thread_inst_executed_per_inst_executed.ratio", @@ -22995,30 +24201,47 @@ "time" ], "times": { - "compilation": 20207.761, - "data": 62147.989, - "framework": 3529478.113, - "kernel_overhead": 1685274.412, - "profiling_overhead": 52019.95, - "profiling_runs": 1730035.762, + "compilation_time": 16380.58, + "data": 58981.219, + "framework": 552243.067, + "kernel_overhead": 196152.587, + "profiling_overhead": 48208.755, + "profiling_runs": 248900.506, "runtimes": [ - 6854.56 + 8125.76 ], - "search_algorithm": 26.676, - "validation": 25.506 + "search_algorithm": 26.987, + "validation": 14.017 }, - "timestamp": "2026-01-27 09:26:3 UTC" + "timestamp": "2026-03-13 09:40:37 UTC" }, { + "compilation_data": { + "constant_memory_size": 0, + "global_size": { + "x": 8, + "y": 32, + "z": 8 + }, + "local_memory_size": 0, + "local_size": { + "x": 32, + "y": 8, + "z": 1 + }, + "max_work_group_size": 1024, + "private_memory_size": 128, + "registers": 32 + }, "configuration": { - "INNER_UNROLL_FACTOR": "16", - "USE_CONSTANT_MEMORY": "0", - "USE_SOA": "0", - "VECTOR_SIZE": "1", - "WORK_GROUP_SIZE_X": "16", - "WORK_GROUP_SIZE_Y": "4", - "WORK_GROUP_SIZE_Z": "1", - "Z_ITERATIONS": "32" + "INNER_UNROLL_FACTOR": 2, + "USE_CONSTANT_MEMORY": 0, + "USE_SOA": 0, + "VECTOR_SIZE": 1, + "WORK_GROUP_SIZE_X": 32, + "WORK_GROUP_SIZE_Y": 8, + "WORK_GROUP_SIZE_Z": 1, + "Z_ITERATIONS": 32 }, "correctness": 1, "invalidity": "correct", @@ -23026,61 +24249,61 @@ { "name": "time", "unit": "", - "value": 6167.584 + "value": 9822.048 }, { "name": "dram__throughput.avg.pct_of_peak_sustained_elapsed", "type": "Double", "unit": "", - "value": 16.884196079605456 + "value": 6.167041910783482 }, { "name": "dram__sectors_read.sum", "type": "Double", "unit": "", - "value": 2103580.0 + "value": 18080.0 }, { "name": "dram__sectors_write.sum", "type": "Double", "unit": "", - "value": 1866632.0 + "value": 2194256.0 }, { "name": "lts__t_sectors.avg.pct_of_peak_sustained_elapsed", "type": "Double", "unit": "", - "value": 2.005123991877393 + "value": 79.44415870359379 }, { "name": "lts__t_sectors_op_read.sum", "type": "Double", "unit": "", - "value": 2197182.0 + "value": 119613413.0 }, { "name": "lts__t_sectors_op_write.sum", "type": "Double", "unit": "", - "value": 2100460.0 + "value": 138421443.0 }, { "name": "l1tex__t_requests_pipe_lsu_mem_global_op_ld.avg.pct_of_peak_sustained_active", "type": "Double", "unit": "", - "value": 0.6352051599499073 + "value": 0.2792457943331886 }, { "name": "l1tex__t_requests_pipe_lsu_mem_global_op_ld.sum", "type": "Double", "unit": "", - "value": 1572864.0 + "value": 1048576.0 }, { "name": "l1tex__data_pipe_lsu_wavefronts_mem_shared.avg.pct_of_peak_sustained_elapsed", "type": "Double", "unit": "", - "value": 0.01984664205088687 + "value": 0.013009034588194182 }, { "name": "l1tex__data_pipe_lsu_wavefronts_mem_shared_op_ld.sum", @@ -23098,25 +24321,25 @@ "name": "l1tex__t_sectors_pipe_lsu_mem_local_op_ld.sum", "type": "Double", "unit": "", - "value": 0.0 + "value": 136314880.0 }, { "name": "l1tex__t_sectors_pipe_lsu_mem_local_op_st.sum", "type": "Double", "unit": "", - "value": 0.0 + "value": 136314880.0 }, { "name": "sm__warps_active.avg.pct_of_peak_sustained_active", "type": "Double", "unit": "", - "value": 81.97671475196458 + "value": 94.69213935633907 }, { "name": "smsp__cycles_active.avg.pct_of_peak_sustained_elapsed", "type": "Double", "unit": "", - "value": 99.96966312844712 + "value": 98.52317582855736 }, { "name": "smsp__sass_thread_inst_executed_op_fp16_pred_on.sum", @@ -23128,7 +24351,7 @@ "name": "smsp__sass_thread_inst_executed_op_fp32_pred_on.sum", "type": "Double", "unit": "", - "value": 4479516672.0 + "value": 4462739456.0 }, { "name": "smsp__sass_thread_inst_executed_op_fp64_pred_on.sum", @@ -23140,25 +24363,25 @@ "name": "smsp__sass_thread_inst_executed_op_integer_pred_on.sum", "type": "Double", "unit": "", - "value": 5811732480.0 + "value": 1817706496.0 }, { "name": "smsp__sass_thread_inst_executed_op_control_pred_on.sum", "type": "Double", "unit": "", - "value": 101187584.0 + "value": 536870912.0 }, { "name": "smsp__sass_thread_inst_executed_op_memory_pred_on.sum", "type": "Double", "unit": "", - "value": 67108864.0 + "value": 1145044992.0 }, { "name": "smsp__sass_thread_inst_executed_op_misc_pred_on.sum", "type": "Double", "unit": "", - "value": 987234304.0 + "value": 72351744.0 }, { "name": "smsp__sass_thread_inst_executed_op_conversion_pred_on.sum", @@ -23170,13 +24393,13 @@ "name": "smsp__inst_executed.sum", "type": "Double", "unit": "", - "value": 898924544.0 + "value": 252264448.0 }, { "name": "smsp__pipe_fma_cycles_active.avg.pct_of_peak_sustained_active", "type": "Double", "unit": "", - "value": 39.29648965208124 + "value": 7.436394387936179 }, { "name": "smsp__inst_executed_pipe_fp64.avg.pct_of_peak_sustained_active", @@ -23188,13 +24411,13 @@ "name": "smsp__inst_executed_pipe_xu.avg.pct_of_peak_sustained_active", "type": "Double", "unit": "", - "value": 27.10550490901888 + "value": 18.027909076596675 }, { "name": "smsp__inst_executed_pipe_lsu.avg.pct_of_peak_sustained_active", "type": "Double", "unit": "", - "value": 0.43344984656756264 + "value": 4.8128707459127105 }, { "name": "smsp__inst_executed_pipe_tex.avg.pct_of_peak_sustained_active", @@ -23206,7 +24429,7 @@ "name": "smsp__issue_active.avg.pct_of_peak_sustained_active", "type": "Double", "unit": "", - "value": 90.87767965534688 + "value": 16.942188684806776 }, { "name": "smsp__thread_inst_executed_per_inst_executed.ratio", @@ -23225,260 +24448,47 @@ "time" ], "times": { - "compilation": 16247.971, - "data": 60003.504, - "framework": 3459799.704, - "kernel_overhead": 1653925.048, - "profiling_overhead": 49848.984, - "profiling_runs": 1696022.168, + "compilation_time": 14446.466, + "data": 59320.634, + "framework": 536863.9010000001, + "kernel_overhead": 184566.688, + "profiling_overhead": 49107.514, + "profiling_runs": 243869.065, "runtimes": [ - 6167.584 + 9822.048 ], - "search_algorithm": 28.137, - "validation": 13.909 + "search_algorithm": 30.978, + "validation": 14.178 }, - "timestamp": "2026-01-27 09:26:4 UTC" + "timestamp": "2026-03-13 09:40:37 UTC" }, { - "configuration": { - "INNER_UNROLL_FACTOR": "16", - "USE_CONSTANT_MEMORY": "0", - "USE_SOA": "0", - "VECTOR_SIZE": "1", - "WORK_GROUP_SIZE_X": "16", - "WORK_GROUP_SIZE_Y": "8", - "WORK_GROUP_SIZE_Z": "1", - "Z_ITERATIONS": "32" + "compilation_data": { + "constant_memory_size": 0, + "global_size": { + "x": 8, + "y": 32, + "z": 32 + }, + "local_memory_size": 0, + "local_size": { + "x": 32, + "y": 8, + "z": 1 + }, + "max_work_group_size": 1024, + "private_memory_size": 0, + "registers": 31 }, - "correctness": 1, - "invalidity": "correct", - "measurements": [ - { - "name": "time", - "unit": "", - "value": 6131.936 - }, - { - "name": "dram__throughput.avg.pct_of_peak_sustained_elapsed", - "type": "Double", - "unit": "", - "value": 16.973215361280854 - }, - { - "name": "dram__sectors_read.sum", - "type": "Double", - "unit": "", - "value": 2098148.0 - }, - { - "name": "dram__sectors_write.sum", - "type": "Double", - "unit": "", - "value": 1869576.0 - }, - { - "name": "lts__t_sectors.avg.pct_of_peak_sustained_elapsed", - "type": "Double", - "unit": "", - "value": 2.014293052831512 - }, - { - "name": "lts__t_sectors_op_read.sum", - "type": "Double", - "unit": "", - "value": 2192694.0 - }, - { - "name": "lts__t_sectors_op_write.sum", - "type": "Double", - "unit": "", - "value": 2099285.0 - }, - { - "name": "l1tex__t_requests_pipe_lsu_mem_global_op_ld.avg.pct_of_peak_sustained_active", - "type": "Double", - "unit": "", - "value": 0.6351765967042945 - }, - { - "name": "l1tex__t_requests_pipe_lsu_mem_global_op_ld.sum", - "type": "Double", - "unit": "", - "value": 1572864.0 - }, - { - "name": "l1tex__data_pipe_lsu_wavefronts_mem_shared.avg.pct_of_peak_sustained_elapsed", - "type": "Double", - "unit": "", - "value": 0.01984774607910102 - }, - { - "name": "l1tex__data_pipe_lsu_wavefronts_mem_shared_op_ld.sum", - "type": "Double", - "unit": "", - "value": 0.0 - }, - { - "name": "l1tex__data_pipe_lsu_wavefronts_mem_shared_op_st.sum", - "type": "Double", - "unit": "", - "value": 0.0 - }, - { - "name": "l1tex__t_sectors_pipe_lsu_mem_local_op_ld.sum", - "type": "Double", - "unit": "", - "value": 0.0 - }, - { - "name": "l1tex__t_sectors_pipe_lsu_mem_local_op_st.sum", - "type": "Double", - "unit": "", - "value": 0.0 - }, - { - "name": "sm__warps_active.avg.pct_of_peak_sustained_active", - "type": "Double", - "unit": "", - "value": 81.93257044255819 - }, - { - "name": "smsp__cycles_active.avg.pct_of_peak_sustained_elapsed", - "type": "Double", - "unit": "", - "value": 99.97355753824115 - }, - { - "name": "smsp__sass_thread_inst_executed_op_fp16_pred_on.sum", - "type": "Double", - "unit": "", - "value": 0.0 - }, - { - "name": "smsp__sass_thread_inst_executed_op_fp32_pred_on.sum", - "type": "Double", - "unit": "", - "value": 4479516672.0 - }, - { - "name": "smsp__sass_thread_inst_executed_op_fp64_pred_on.sum", - "type": "Double", - "unit": "", - "value": 0.0 - }, - { - "name": "smsp__sass_thread_inst_executed_op_integer_pred_on.sum", - "type": "Double", - "unit": "", - "value": 5811732480.0 - }, - { - "name": "smsp__sass_thread_inst_executed_op_control_pred_on.sum", - "type": "Double", - "unit": "", - "value": 101187584.0 - }, - { - "name": "smsp__sass_thread_inst_executed_op_memory_pred_on.sum", - "type": "Double", - "unit": "", - "value": 67108864.0 - }, - { - "name": "smsp__sass_thread_inst_executed_op_misc_pred_on.sum", - "type": "Double", - "unit": "", - "value": 987234304.0 - }, - { - "name": "smsp__sass_thread_inst_executed_op_conversion_pred_on.sum", - "type": "Double", - "unit": "", - "value": 1572864.0 - }, - { - "name": "smsp__inst_executed.sum", - "type": "Double", - "unit": "", - "value": 898924544.0 - }, - { - "name": "smsp__pipe_fma_cycles_active.avg.pct_of_peak_sustained_active", - "type": "Double", - "unit": "", - "value": 39.297251209720734 - }, - { - "name": "smsp__inst_executed_pipe_fp64.avg.pct_of_peak_sustained_active", - "type": "Double", - "unit": "", - "value": 0.0 - }, - { - "name": "smsp__inst_executed_pipe_xu.avg.pct_of_peak_sustained_active", - "type": "Double", - "unit": "", - "value": 27.10595679559264 - }, - { - "name": "smsp__inst_executed_pipe_lsu.avg.pct_of_peak_sustained_active", - "type": "Double", - "unit": "", - "value": 0.4334570727810834 - }, - { - "name": "smsp__inst_executed_pipe_tex.avg.pct_of_peak_sustained_active", - "type": "Double", - "unit": "", - "value": 0.0 - }, - { - "name": "smsp__issue_active.avg.pct_of_peak_sustained_active", - "type": "Double", - "unit": "", - "value": 90.87919410715813 - }, - { - "name": "smsp__thread_inst_executed_per_inst_executed.ratio", - "type": "Double", - "unit": "", - "value": 32.0 - }, - { - "name": "smsp__thread_inst_executed_per_inst_executed.pct", - "type": "Double", - "unit": "", - "value": 100.0 - } - ], - "objectives": [ - "time" - ], - "times": { - "compilation": 17582.338, - "data": 62548.75, - "framework": 3467482.934, - "kernel_overhead": 1655460.517, - "profiling_overhead": 52566.669, - "profiling_runs": 1696906.998, - "runtimes": [ - 6131.936 - ], - "search_algorithm": 25.346, - "validation": 17.49 - }, - "timestamp": "2026-01-27 09:26:6 UTC" - }, - { "configuration": { - "INNER_UNROLL_FACTOR": "16", - "USE_CONSTANT_MEMORY": "0", - "USE_SOA": "0", - "VECTOR_SIZE": "1", - "WORK_GROUP_SIZE_X": "32", - "WORK_GROUP_SIZE_Y": "2", - "WORK_GROUP_SIZE_Z": "1", - "Z_ITERATIONS": "32" + "INNER_UNROLL_FACTOR": 4, + "USE_CONSTANT_MEMORY": 0, + "USE_SOA": 0, + "VECTOR_SIZE": 1, + "WORK_GROUP_SIZE_X": 32, + "WORK_GROUP_SIZE_Y": 8, + "WORK_GROUP_SIZE_Z": 1, + "Z_ITERATIONS": 8 }, "correctness": 1, "invalidity": "correct", @@ -23486,61 +24496,61 @@ { "name": "time", "unit": "", - "value": 6127.68 + "value": 2569.664 }, { "name": "dram__throughput.avg.pct_of_peak_sustained_elapsed", "type": "Double", "unit": "", - "value": 16.836506971598812 + "value": 18.792130139140003 }, { "name": "dram__sectors_read.sum", "type": "Double", "unit": "", - "value": 2097616.0 + "value": 252.0 }, { "name": "dram__sectors_write.sum", "type": "Double", "unit": "", - "value": 1837504.0 + "value": 1835688.0 }, { "name": "lts__t_sectors.avg.pct_of_peak_sustained_elapsed", "type": "Double", "unit": "", - "value": 2.017043860853979 + "value": 2.4201583944003566 }, { "name": "lts__t_sectors_op_read.sum", "type": "Double", "unit": "", - "value": 2197977.0 + "value": 41340.0 }, { "name": "lts__t_sectors_op_write.sum", "type": "Double", "unit": "", - "value": 2099175.0 + "value": 2099098.0 }, { "name": "l1tex__t_requests_pipe_lsu_mem_global_op_ld.avg.pct_of_peak_sustained_active", "type": "Double", "unit": "", - "value": 0.6353684699560179 + "value": 4.07841962173134 }, { "name": "l1tex__t_requests_pipe_lsu_mem_global_op_ld.sum", "type": "Double", "unit": "", - "value": 1572864.0 + "value": 4194304.0 }, { "name": "l1tex__data_pipe_lsu_wavefronts_mem_shared.avg.pct_of_peak_sustained_elapsed", "type": "Double", "unit": "", - "value": 0.01985092485310267 + "value": 0.19107541888908336 }, { "name": "l1tex__data_pipe_lsu_wavefronts_mem_shared_op_ld.sum", @@ -23570,13 +24580,13 @@ "name": "sm__warps_active.avg.pct_of_peak_sustained_active", "type": "Double", "unit": "", - "value": 81.98752936183982 + "value": 98.31426708659741 }, { "name": "smsp__cycles_active.avg.pct_of_peak_sustained_elapsed", "type": "Double", "unit": "", - "value": 99.97313651139189 + "value": 99.93919243073574 }, { "name": "smsp__sass_thread_inst_executed_op_fp16_pred_on.sum", @@ -23588,7 +24598,7 @@ "name": "smsp__sass_thread_inst_executed_op_fp32_pred_on.sum", "type": "Double", "unit": "", - "value": 4479516672.0 + "value": 4966055936.0 }, { "name": "smsp__sass_thread_inst_executed_op_fp64_pred_on.sum", @@ -23600,43 +24610,43 @@ "name": "smsp__sass_thread_inst_executed_op_integer_pred_on.sum", "type": "Double", "unit": "", - "value": 5811732480.0 + "value": 2776629248.0 }, { "name": "smsp__sass_thread_inst_executed_op_control_pred_on.sum", "type": "Double", "unit": "", - "value": 101187584.0 + "value": 268435456.0 }, { "name": "smsp__sass_thread_inst_executed_op_memory_pred_on.sum", "type": "Double", "unit": "", - "value": 67108864.0 + "value": 150994944.0 }, { "name": "smsp__sass_thread_inst_executed_op_misc_pred_on.sum", "type": "Double", "unit": "", - "value": 987234304.0 + "value": 1642070016.0 }, { "name": "smsp__sass_thread_inst_executed_op_conversion_pred_on.sum", "type": "Double", "unit": "", - "value": 1572864.0 + "value": 6291456.0 }, { "name": "smsp__inst_executed.sum", "type": "Double", "unit": "", - "value": 898924544.0 + "value": 378208256.0 }, { "name": "smsp__pipe_fma_cycles_active.avg.pct_of_peak_sustained_active", "type": "Double", "unit": "", - "value": 39.303758370448634 + "value": 45.37643026506167 }, { "name": "smsp__inst_executed_pipe_fp64.avg.pct_of_peak_sustained_active", @@ -23648,13 +24658,13 @@ "name": "smsp__inst_executed_pipe_xu.avg.pct_of_peak_sustained_active", "type": "Double", "unit": "", - "value": 27.110412201929663 + "value": 65.26009272355859 }, { "name": "smsp__inst_executed_pipe_lsu.avg.pct_of_peak_sustained_active", "type": "Double", "unit": "", - "value": 0.433528320123631 + "value": 2.3898959737631316 }, { "name": "smsp__inst_executed_pipe_tex.avg.pct_of_peak_sustained_active", @@ -23666,7 +24676,7 @@ "name": "smsp__issue_active.avg.pct_of_peak_sustained_active", "type": "Double", "unit": "", - "value": 90.89413101207882 + "value": 91.94814644378341 }, { "name": "smsp__thread_inst_executed_per_inst_executed.ratio", @@ -23685,30 +24695,47 @@ "time" ], "times": { - "compilation": 15873.069, - "data": 60959.931, - "framework": 3463857.324, - "kernel_overhead": 1655151.421, - "profiling_overhead": 50763.829, - "profiling_runs": 1696982.143, + "compilation_time": 15023.824, + "data": 57341.062, + "framework": 1083610.326, + "kernel_overhead": 477397.596, + "profiling_overhead": 47137.737, + "profiling_runs": 501733.931, "runtimes": [ - 6127.68 + 2569.664 ], - "search_algorithm": 23.922, - "validation": 20.12 + "search_algorithm": 24.901, + "validation": 15.466 }, - "timestamp": "2026-01-27 09:26:8 UTC" + "timestamp": "2026-03-13 09:40:38 UTC" }, { + "compilation_data": { + "constant_memory_size": 0, + "global_size": { + "x": 8, + "y": 32, + "z": 16 + }, + "local_memory_size": 0, + "local_size": { + "x": 32, + "y": 8, + "z": 1 + }, + "max_work_group_size": 1024, + "private_memory_size": 0, + "registers": 38 + }, "configuration": { - "INNER_UNROLL_FACTOR": "16", - "USE_CONSTANT_MEMORY": "0", - "USE_SOA": "0", - "VECTOR_SIZE": "1", - "WORK_GROUP_SIZE_X": "32", - "WORK_GROUP_SIZE_Y": "4", - "WORK_GROUP_SIZE_Z": "1", - "Z_ITERATIONS": "32" + "INNER_UNROLL_FACTOR": 4, + "USE_CONSTANT_MEMORY": 0, + "USE_SOA": 0, + "VECTOR_SIZE": 1, + "WORK_GROUP_SIZE_X": 32, + "WORK_GROUP_SIZE_Y": 8, + "WORK_GROUP_SIZE_Z": 1, + "Z_ITERATIONS": 16 }, "correctness": 1, "invalidity": "correct", @@ -23716,61 +24743,61 @@ { "name": "time", "unit": "", - "value": 6148.032 + "value": 3306.56 }, { "name": "dram__throughput.avg.pct_of_peak_sustained_elapsed", "type": "Double", "unit": "", - "value": 16.81448862085903 + "value": 14.713832292413228 }, { "name": "dram__sectors_read.sum", "type": "Double", "unit": "", - "value": 2103076.0 + "value": 2420.0 }, { "name": "dram__sectors_write.sum", "type": "Double", "unit": "", - "value": 1840500.0 + "value": 1837068.0 }, { "name": "lts__t_sectors.avg.pct_of_peak_sustained_elapsed", "type": "Double", "unit": "", - "value": 2.0234475421224705 + "value": 1.9098400446895907 }, { "name": "lts__t_sectors_op_read.sum", "type": "Double", "unit": "", - "value": 2207337.0 + "value": 57096.0 }, { "name": "lts__t_sectors_op_write.sum", "type": "Double", "unit": "", - "value": 2108751.0 + "value": 2103671.0 }, { "name": "l1tex__t_requests_pipe_lsu_mem_global_op_ld.avg.pct_of_peak_sustained_active", "type": "Double", "unit": "", - "value": 0.635351905476862 + "value": 1.59355358021411 }, { "name": "l1tex__t_requests_pipe_lsu_mem_global_op_ld.sum", "type": "Double", "unit": "", - "value": 1572864.0 + "value": 2097152.0 }, { "name": "l1tex__data_pipe_lsu_wavefronts_mem_shared.avg.pct_of_peak_sustained_elapsed", "type": "Double", "unit": "", - "value": 0.019851266710491956 + "value": 0.07467034119934253 }, { "name": "l1tex__data_pipe_lsu_wavefronts_mem_shared_op_ld.sum", @@ -23800,13 +24827,13 @@ "name": "sm__warps_active.avg.pct_of_peak_sustained_active", "type": "Double", "unit": "", - "value": 81.9538040075383 + "value": 98.3678901921875 }, { "name": "smsp__cycles_active.avg.pct_of_peak_sustained_elapsed", "type": "Double", "unit": "", - "value": 99.97838864579919 + "value": 99.95966484453128 }, { "name": "smsp__sass_thread_inst_executed_op_fp16_pred_on.sum", @@ -23818,7 +24845,7 @@ "name": "smsp__sass_thread_inst_executed_op_fp32_pred_on.sum", "type": "Double", "unit": "", - "value": 4479516672.0 + "value": 4630511616.0 }, { "name": "smsp__sass_thread_inst_executed_op_fp64_pred_on.sum", @@ -23830,43 +24857,43 @@ "name": "smsp__sass_thread_inst_executed_op_integer_pred_on.sum", "type": "Double", "unit": "", - "value": 5811732480.0 + "value": 3234856960.0 }, { "name": "smsp__sass_thread_inst_executed_op_control_pred_on.sum", "type": "Double", "unit": "", - "value": 101187584.0 + "value": 268435456.0 }, { "name": "smsp__sass_thread_inst_executed_op_memory_pred_on.sum", "type": "Double", "unit": "", - "value": 67108864.0 + "value": 83886080.0 }, { "name": "smsp__sass_thread_inst_executed_op_misc_pred_on.sum", "type": "Double", "unit": "", - "value": 987234304.0 + "value": 1294991360.0 }, { "name": "smsp__sass_thread_inst_executed_op_conversion_pred_on.sum", "type": "Double", "unit": "", - "value": 1572864.0 + "value": 3145728.0 }, { "name": "smsp__inst_executed.sum", "type": "Double", "unit": "", - "value": 898924544.0 + "value": 500957184.0 }, { "name": "smsp__pipe_fma_cycles_active.avg.pct_of_peak_sustained_active", "type": "Double", "unit": "", - "value": 39.302178074540734 + "value": 45.375150850533416 }, { "name": "smsp__inst_executed_pipe_fp64.avg.pct_of_peak_sustained_active", @@ -23878,13 +24905,13 @@ "name": "smsp__inst_executed_pipe_xu.avg.pct_of_peak_sustained_active", "type": "Double", "unit": "", - "value": 27.109454869038675 + "value": 50.99552204851818 }, { "name": "smsp__inst_executed_pipe_lsu.avg.pct_of_peak_sustained_active", "type": "Double", "unit": "", - "value": 0.43351301121143393 + "value": 1.0333565258855002 }, { "name": "smsp__inst_executed_pipe_tex.avg.pct_of_peak_sustained_active", @@ -23896,7 +24923,7 @@ "name": "smsp__issue_active.avg.pct_of_peak_sustained_active", "type": "Double", "unit": "", - "value": 90.89092182989874 + "value": 95.16921143581212 }, { "name": "smsp__thread_inst_executed_per_inst_executed.ratio", @@ -23915,30 +24942,47 @@ "time" ], "times": { - "compilation": 15136.492, - "data": 60236.566, - "framework": 3462842.324, - "kernel_overhead": 1655265.842, - "profiling_overhead": 50161.7, - "profiling_runs": 1697178.216, + "compilation_time": 14574.928, + "data": 57463.432, + "framework": 1845392.958, + "kernel_overhead": 856214.361, + "profiling_overhead": 47397.522, + "profiling_runs": 884317.643, "runtimes": [ - 6148.032 + 3306.56 ], - "search_algorithm": 25.4, - "validation": 14.897 + "search_algorithm": 25.305, + "validation": 15.046 }, - "timestamp": "2026-01-27 09:26:10 UTC" + "timestamp": "2026-03-13 09:40:39 UTC" }, { + "compilation_data": { + "constant_memory_size": 0, + "global_size": { + "x": 8, + "y": 32, + "z": 8 + }, + "local_memory_size": 0, + "local_size": { + "x": 32, + "y": 8, + "z": 1 + }, + "max_work_group_size": 1024, + "private_memory_size": 0, + "registers": 56 + }, "configuration": { - "INNER_UNROLL_FACTOR": "16", - "USE_CONSTANT_MEMORY": "0", - "USE_SOA": "0", - "VECTOR_SIZE": "1", - "WORK_GROUP_SIZE_X": "32", - "WORK_GROUP_SIZE_Y": "8", - "WORK_GROUP_SIZE_Z": "1", - "Z_ITERATIONS": "32" + "INNER_UNROLL_FACTOR": 4, + "USE_CONSTANT_MEMORY": 0, + "USE_SOA": 0, + "VECTOR_SIZE": 1, + "WORK_GROUP_SIZE_X": 32, + "WORK_GROUP_SIZE_Y": 8, + "WORK_GROUP_SIZE_Z": 1, + "Z_ITERATIONS": 32 }, "correctness": 1, "invalidity": "correct", @@ -23946,61 +24990,61 @@ { "name": "time", "unit": "", - "value": 6127.008 + "value": 5474.08 }, { "name": "dram__throughput.avg.pct_of_peak_sustained_elapsed", "type": "Double", "unit": "", - "value": 16.898960656956653 + "value": 8.777453107289547 }, { "name": "dram__sectors_read.sum", "type": "Double", "unit": "", - "value": 2113656.0 + "value": 6788.0 }, { "name": "dram__sectors_write.sum", "type": "Double", "unit": "", - "value": 1842680.0 + "value": 1836876.0 }, { "name": "lts__t_sectors.avg.pct_of_peak_sustained_elapsed", "type": "Double", "unit": "", - "value": 2.0246156641227504 + "value": 1.1645846960726136 }, { "name": "lts__t_sectors_op_read.sum", "type": "Double", "unit": "", - "value": 2215133.0 + "value": 100293.0 }, { "name": "lts__t_sectors_op_write.sum", "type": "Double", "unit": "", - "value": 2109759.0 + "value": 2109537.0 }, { "name": "l1tex__t_requests_pipe_lsu_mem_global_op_ld.avg.pct_of_peak_sustained_active", "type": "Double", "unit": "", - "value": 0.636231290062955 + "value": 0.4770953197024484 }, { "name": "l1tex__t_requests_pipe_lsu_mem_global_op_ld.sum", "type": "Double", "unit": "", - "value": 1572864.0 + "value": 1048576.0 }, { "name": "l1tex__data_pipe_lsu_wavefronts_mem_shared.avg.pct_of_peak_sustained_elapsed", "type": "Double", "unit": "", - "value": 0.019877237996958046 + "value": 0.022355722628357652 }, { "name": "l1tex__data_pipe_lsu_wavefronts_mem_shared_op_ld.sum", @@ -24030,13 +25074,13 @@ "name": "sm__warps_active.avg.pct_of_peak_sustained_active", "type": "Double", "unit": "", - "value": 81.87668246169547 + "value": 65.7579605099478 }, { "name": "smsp__cycles_active.avg.pct_of_peak_sustained_elapsed", "type": "Double", "unit": "", - "value": 99.96557414596381 + "value": 99.95633079388723 }, { "name": "smsp__sass_thread_inst_executed_op_fp16_pred_on.sum", @@ -24048,7 +25092,7 @@ "name": "smsp__sass_thread_inst_executed_op_fp32_pred_on.sum", "type": "Double", "unit": "", - "value": 4479516672.0 + "value": 4462739456.0 }, { "name": "smsp__sass_thread_inst_executed_op_fp64_pred_on.sum", @@ -24060,25 +25104,25 @@ "name": "smsp__sass_thread_inst_executed_op_integer_pred_on.sum", "type": "Double", "unit": "", - "value": 5811732480.0 + "value": 4706533376.0 }, { "name": "smsp__sass_thread_inst_executed_op_control_pred_on.sum", "type": "Double", "unit": "", - "value": 101187584.0 + "value": 268435456.0 }, { "name": "smsp__sass_thread_inst_executed_op_memory_pred_on.sum", "type": "Double", "unit": "", - "value": 67108864.0 + "value": 50331648.0 }, { "name": "smsp__sass_thread_inst_executed_op_misc_pred_on.sum", "type": "Double", "unit": "", - "value": 987234304.0 + "value": 1121452032.0 }, { "name": "smsp__sass_thread_inst_executed_op_conversion_pred_on.sum", @@ -24090,13 +25134,13 @@ "name": "smsp__inst_executed.sum", "type": "Double", "unit": "", - "value": 898924544.0 + "value": 852869120.0 }, { "name": "smsp__pipe_fma_cycles_active.avg.pct_of_peak_sustained_active", "type": "Double", "unit": "", - "value": 39.358902700716406 + "value": 43.02563571652544 }, { "name": "smsp__inst_executed_pipe_fp64.avg.pct_of_peak_sustained_active", @@ -24108,13 +25152,13 @@ "name": "smsp__inst_executed_pipe_xu.avg.pct_of_peak_sustained_active", "type": "Double", "unit": "", - "value": 27.148401680982566 + "value": 30.536348276118996 }, { "name": "smsp__inst_executed_pipe_lsu.avg.pct_of_peak_sustained_active", "type": "Double", "unit": "", - "value": 0.4341358178965718 + "value": 0.36903057609079354 }, { "name": "smsp__inst_executed_pipe_tex.avg.pct_of_peak_sustained_active", @@ -24126,7 +25170,7 @@ "name": "smsp__issue_active.avg.pct_of_peak_sustained_active", "type": "Double", "unit": "", - "value": 91.02151751142915 + "value": 97.02007268324498 }, { "name": "smsp__thread_inst_executed_per_inst_executed.ratio", @@ -24145,30 +25189,47 @@ "time" ], "times": { - "compilation": 16280.125, - "data": 63785.982, - "framework": 3473838.756, - "kernel_overhead": 1657406.562, - "profiling_overhead": 53476.752, - "profiling_runs": 1699169.46, + "compilation_time": 14485.702, + "data": 57765.688, + "framework": 3076438.659, + "kernel_overhead": 1466110.634, + "profiling_overhead": 47686.888, + "profiling_runs": 1504875.449, "runtimes": [ - 6127.008 + 5474.08 ], - "search_algorithm": 30.252, - "validation": 17.994 + "search_algorithm": 31.983, + "validation": 16.625 }, - "timestamp": "2026-01-27 09:26:12 UTC" + "timestamp": "2026-03-13 09:40:40 UTC" }, { + "compilation_data": { + "constant_memory_size": 0, + "global_size": { + "x": 8, + "y": 32, + "z": 16 + }, + "local_memory_size": 0, + "local_size": { + "x": 32, + "y": 8, + "z": 1 + }, + "max_work_group_size": 1024, + "private_memory_size": 0, + "registers": 38 + }, "configuration": { - "INNER_UNROLL_FACTOR": "0", - "USE_CONSTANT_MEMORY": "0", - "USE_SOA": "1", - "VECTOR_SIZE": "1", - "WORK_GROUP_SIZE_X": "16", - "WORK_GROUP_SIZE_Y": "4", - "WORK_GROUP_SIZE_Z": "1", - "Z_ITERATIONS": "1" + "INNER_UNROLL_FACTOR": 8, + "USE_CONSTANT_MEMORY": 0, + "USE_SOA": 0, + "VECTOR_SIZE": 1, + "WORK_GROUP_SIZE_X": 32, + "WORK_GROUP_SIZE_Y": 8, + "WORK_GROUP_SIZE_Z": 1, + "Z_ITERATIONS": 16 }, "correctness": 1, "invalidity": "correct", @@ -24176,61 +25237,61 @@ { "name": "time", "unit": "", - "value": 6867.488 + "value": 3240.512 }, { "name": "dram__throughput.avg.pct_of_peak_sustained_elapsed", "type": "Double", "unit": "", - "value": 14.17269240442656 + "value": 14.723971928787014 }, { "name": "dram__sectors_read.sum", "type": "Double", "unit": "", - "value": 2098048.0 + "value": 8624.0 }, { "name": "dram__sectors_write.sum", "type": "Double", "unit": "", - "value": 1869036.0 + "value": 1839708.0 }, { "name": "lts__t_sectors.avg.pct_of_peak_sustained_elapsed", "type": "Double", "unit": "", - "value": 1.7701471849423482 + "value": 1.9106651029471386 }, { "name": "lts__t_sectors_op_read.sum", "type": "Double", "unit": "", - "value": 2201040.0 + "value": 63056.0 }, { "name": "lts__t_sectors_op_write.sum", "type": "Double", "unit": "", - "value": 2099173.0 + "value": 2106623.0 }, { "name": "l1tex__t_requests_pipe_lsu_mem_global_op_ld.avg.pct_of_peak_sustained_active", "type": "Double", "unit": "", - "value": 47.956233192546485 + "value": 1.5960594117261604 }, { "name": "l1tex__t_requests_pipe_lsu_mem_global_op_ld.sum", "type": "Double", "unit": "", - "value": 134742016.0 + "value": 2097152.0 }, { "name": "l1tex__data_pipe_lsu_wavefronts_mem_shared.avg.pct_of_peak_sustained_elapsed", "type": "Double", "unit": "", - "value": 0.5597051413373055 + "value": 0.07479020034132768 }, { "name": "l1tex__data_pipe_lsu_wavefronts_mem_shared_op_ld.sum", @@ -24260,13 +25321,13 @@ "name": "sm__warps_active.avg.pct_of_peak_sustained_active", "type": "Double", "unit": "", - "value": 66.16893691135152 + "value": 98.52535531818756 }, { "name": "smsp__cycles_active.avg.pct_of_peak_sustained_elapsed", "type": "Double", "unit": "", - "value": 99.95139520899937 + "value": 99.95761706054063 }, { "name": "smsp__sass_thread_inst_executed_op_fp16_pred_on.sum", @@ -24278,7 +25339,7 @@ "name": "smsp__sass_thread_inst_executed_op_fp32_pred_on.sum", "type": "Double", "unit": "", - "value": 8606711808.0 + "value": 4630511616.0 }, { "name": "smsp__sass_thread_inst_executed_op_fp64_pred_on.sum", @@ -24290,43 +25351,43 @@ "name": "smsp__sass_thread_inst_executed_op_integer_pred_on.sum", "type": "Double", "unit": "", - "value": 2952790016.0 + "value": 2295332864.0 }, { "name": "smsp__sass_thread_inst_executed_op_control_pred_on.sum", "type": "Double", "unit": "", - "value": 285212672.0 + "value": 134217728.0 }, { "name": "smsp__sass_thread_inst_executed_op_memory_pred_on.sum", "type": "Double", "unit": "", - "value": 4328521728.0 + "value": 83886080.0 }, { "name": "smsp__sass_thread_inst_executed_op_misc_pred_on.sum", "type": "Double", "unit": "", - "value": 318767104.0 + "value": 1429209088.0 }, { "name": "smsp__sass_thread_inst_executed_op_conversion_pred_on.sum", "type": "Double", "unit": "", - "value": 50331648.0 + "value": 3145728.0 }, { "name": "smsp__inst_executed.sum", "type": "Double", "unit": "", - "value": 529006592.0 + "value": 471597056.0 }, { "name": "smsp__pipe_fma_cycles_active.avg.pct_of_peak_sustained_active", "type": "Double", "unit": "", - "value": 21.325843592569584 + "value": 42.25551024595298 }, { "name": "smsp__inst_executed_pipe_fp64.avg.pct_of_peak_sustained_active", @@ -24338,13 +25399,13 @@ "name": "smsp__inst_executed_pipe_xu.avg.pct_of_peak_sustained_active", "type": "Double", "unit": "", - "value": 23.892365531390006 + "value": 51.07842530441994 }, { "name": "smsp__inst_executed_pipe_lsu.avg.pct_of_peak_sustained_active", "type": "Double", "unit": "", - "value": 24.359013295674966 + "value": 1.0350364502604625 }, { "name": "smsp__inst_executed_pipe_tex.avg.pct_of_peak_sustained_active", @@ -24356,7 +25417,7 @@ "name": "smsp__issue_active.avg.pct_of_peak_sustained_active", "type": "Double", "unit": "", - "value": 47.085145702381816 + "value": 89.73722494856527 }, { "name": "smsp__thread_inst_executed_per_inst_executed.ratio", @@ -24375,30 +25436,47 @@ "time" ], "times": { - "compilation": 20165.277, - "data": 63513.234, - "framework": 520007.766, - "kernel_overhead": 177352.115, - "profiling_overhead": 53331.493, - "profiling_runs": 225810.924, + "compilation_time": 14520.729, + "data": 58542.494, + "framework": 1671651.1439999999, + "kernel_overhead": 768327.674, + "profiling_overhead": 48488.431, + "profiling_runs": 796292.545, "runtimes": [ - 6867.488 + 3240.512 ], - "search_algorithm": 31.523, - "validation": 20.895 + "search_algorithm": 26.552, + "validation": 17.149 }, - "timestamp": "2026-01-27 09:26:12 UTC" + "timestamp": "2026-03-13 09:40:41 UTC" }, { + "compilation_data": { + "constant_memory_size": 0, + "global_size": { + "x": 8, + "y": 32, + "z": 8 + }, + "local_memory_size": 0, + "local_size": { + "x": 32, + "y": 8, + "z": 1 + }, + "max_work_group_size": 1024, + "private_memory_size": 0, + "registers": 48 + }, "configuration": { - "INNER_UNROLL_FACTOR": "0", - "USE_CONSTANT_MEMORY": "0", - "USE_SOA": "1", - "VECTOR_SIZE": "1", - "WORK_GROUP_SIZE_X": "16", - "WORK_GROUP_SIZE_Y": "8", - "WORK_GROUP_SIZE_Z": "1", - "Z_ITERATIONS": "1" + "INNER_UNROLL_FACTOR": 8, + "USE_CONSTANT_MEMORY": 0, + "USE_SOA": 0, + "VECTOR_SIZE": 1, + "WORK_GROUP_SIZE_X": 32, + "WORK_GROUP_SIZE_Y": 8, + "WORK_GROUP_SIZE_Z": 1, + "Z_ITERATIONS": 32 }, "correctness": 1, "invalidity": "correct", @@ -24406,61 +25484,61 @@ { "name": "time", "unit": "", - "value": 6940.288 + "value": 6692.736 }, { "name": "dram__throughput.avg.pct_of_peak_sustained_elapsed", "type": "Double", "unit": "", - "value": 14.884589678712942 + "value": 7.737679127307858 }, { "name": "dram__sectors_read.sum", "type": "Double", "unit": "", - "value": 2112684.0 + "value": 12112.0 }, { "name": "dram__sectors_write.sum", "type": "Double", "unit": "", - "value": 1878196.0 + "value": 1833244.0 }, { "name": "lts__t_sectors.avg.pct_of_peak_sustained_elapsed", "type": "Double", "unit": "", - "value": 1.7852167277825322 + "value": 1.0272868550267 }, { "name": "lts__t_sectors_op_read.sum", "type": "Double", "unit": "", - "value": 2228589.0 + "value": 114605.0 }, { "name": "lts__t_sectors_op_write.sum", "type": "Double", "unit": "", - "value": 2103846.0 + "value": 2102181.0 }, { "name": "l1tex__t_requests_pipe_lsu_mem_global_op_ld.avg.pct_of_peak_sustained_active", "type": "Double", "unit": "", - "value": 48.001785978871006 + "value": 0.4200088916868048 }, { "name": "l1tex__t_requests_pipe_lsu_mem_global_op_ld.sum", "type": "Double", "unit": "", - "value": 134742016.0 + "value": 1048576.0 }, { "name": "l1tex__data_pipe_lsu_wavefronts_mem_shared.avg.pct_of_peak_sustained_elapsed", "type": "Double", "unit": "", - "value": 0.5602516595925187 + "value": 0.0196833549202028 }, { "name": "l1tex__data_pipe_lsu_wavefronts_mem_shared_op_ld.sum", @@ -24490,13 +25568,13 @@ "name": "sm__warps_active.avg.pct_of_peak_sustained_active", "type": "Double", "unit": "", - "value": 81.39998394347539 + "value": 81.95622202719717 }, { "name": "smsp__cycles_active.avg.pct_of_peak_sustained_elapsed", "type": "Double", "unit": "", - "value": 99.96367328301893 + "value": 99.96903631908961 }, { "name": "smsp__sass_thread_inst_executed_op_fp16_pred_on.sum", @@ -24508,7 +25586,7 @@ "name": "smsp__sass_thread_inst_executed_op_fp32_pred_on.sum", "type": "Double", "unit": "", - "value": 8606711808.0 + "value": 4462739456.0 }, { "name": "smsp__sass_thread_inst_executed_op_fp64_pred_on.sum", @@ -24520,43 +25598,43 @@ "name": "smsp__sass_thread_inst_executed_op_integer_pred_on.sum", "type": "Double", "unit": "", - "value": 2952790016.0 + "value": 6215958528.0 }, { "name": "smsp__sass_thread_inst_executed_op_control_pred_on.sum", "type": "Double", "unit": "", - "value": 285212672.0 + "value": 134217728.0 }, { "name": "smsp__sass_thread_inst_executed_op_memory_pred_on.sum", "type": "Double", "unit": "", - "value": 4328521728.0 + "value": 50331648.0 }, { "name": "smsp__sass_thread_inst_executed_op_misc_pred_on.sum", "type": "Double", "unit": "", - "value": 318767104.0 + "value": 1355284480.0 }, { "name": "smsp__sass_thread_inst_executed_op_conversion_pred_on.sum", "type": "Double", "unit": "", - "value": 50331648.0 + "value": 1572864.0 }, { "name": "smsp__inst_executed.sum", "type": "Double", "unit": "", - "value": 529006592.0 + "value": 936706048.0 }, { "name": "smsp__pipe_fma_cycles_active.avg.pct_of_peak_sustained_active", "type": "Double", "unit": "", - "value": 21.344458596856413 + "value": 42.91620457340909 }, { "name": "smsp__inst_executed_pipe_fp64.avg.pct_of_peak_sustained_active", @@ -24568,13 +25646,13 @@ "name": "smsp__inst_executed_pipe_xu.avg.pct_of_peak_sustained_active", "type": "Double", "unit": "", - "value": 23.91275752903074 + "value": 26.882664446823085 }, { "name": "smsp__inst_executed_pipe_lsu.avg.pct_of_peak_sustained_active", "type": "Double", "unit": "", - "value": 24.37980357451962 + "value": 0.324875949735777 }, { "name": "smsp__inst_executed_pipe_tex.avg.pct_of_peak_sustained_active", @@ -24586,7 +25664,7 @@ "name": "smsp__issue_active.avg.pct_of_peak_sustained_active", "type": "Double", "unit": "", - "value": 47.1253507782709 + "value": 93.80760361054595 }, { "name": "smsp__thread_inst_executed_per_inst_executed.ratio", @@ -24605,30 +25683,47 @@ "time" ], "times": { - "compilation": 22193.589, - "data": 62466.989, - "framework": 516032.46400000004, - "kernel_overhead": 177178.21, - "profiling_overhead": 51678.671, - "profiling_runs": 224708.594, + "compilation_time": 15040.364, + "data": 57880.419, + "framework": 3575988.309, + "kernel_overhead": 1713805.869, + "profiling_overhead": 47670.539, + "profiling_runs": 1756631.482, "runtimes": [ - 6940.288 + 6692.736 ], - "search_algorithm": 36.99, - "validation": 18.936 + "search_algorithm": 26.475, + "validation": 15.541 }, - "timestamp": "2026-01-27 09:26:12 UTC" + "timestamp": "2026-03-13 09:40:43 UTC" }, { + "compilation_data": { + "constant_memory_size": 0, + "global_size": { + "x": 8, + "y": 32, + "z": 8 + }, + "local_memory_size": 0, + "local_size": { + "x": 32, + "y": 8, + "z": 1 + }, + "max_work_group_size": 1024, + "private_memory_size": 0, + "registers": 48 + }, "configuration": { - "INNER_UNROLL_FACTOR": "0", - "USE_CONSTANT_MEMORY": "0", - "USE_SOA": "1", - "VECTOR_SIZE": "1", - "WORK_GROUP_SIZE_X": "32", - "WORK_GROUP_SIZE_Y": "2", - "WORK_GROUP_SIZE_Z": "1", - "Z_ITERATIONS": "1" + "INNER_UNROLL_FACTOR": 16, + "USE_CONSTANT_MEMORY": 0, + "USE_SOA": 0, + "VECTOR_SIZE": 1, + "WORK_GROUP_SIZE_X": 32, + "WORK_GROUP_SIZE_Y": 8, + "WORK_GROUP_SIZE_Z": 1, + "Z_ITERATIONS": 32 }, "correctness": 1, "invalidity": "correct", @@ -24636,61 +25731,61 @@ { "name": "time", "unit": "", - "value": 7035.136 + "value": 5878.688 }, { "name": "dram__throughput.avg.pct_of_peak_sustained_elapsed", "type": "Double", "unit": "", - "value": 14.434834070485488 + "value": 8.186485294621349 }, { "name": "dram__sectors_read.sum", "type": "Double", "unit": "", - "value": 2104288.0 + "value": 4432.0 }, { "name": "dram__sectors_write.sum", "type": "Double", "unit": "", - "value": 1839244.0 + "value": 1831164.0 }, { "name": "lts__t_sectors.avg.pct_of_peak_sustained_elapsed", "type": "Double", "unit": "", - "value": 1.7823251522435732 + "value": 1.077164795039242 }, { "name": "lts__t_sectors_op_read.sum", "type": "Double", "unit": "", - "value": 2218292.0 + "value": 100195.0 }, { "name": "lts__t_sectors_op_write.sum", "type": "Double", "unit": "", - "value": 2100070.0 + "value": 2101108.0 }, { "name": "l1tex__t_requests_pipe_lsu_mem_global_op_ld.avg.pct_of_peak_sustained_active", "type": "Double", "unit": "", - "value": 47.959798828703555 + "value": 0.44152929626775533 }, { "name": "l1tex__t_requests_pipe_lsu_mem_global_op_ld.sum", "type": "Double", "unit": "", - "value": 134742016.0 + "value": 1048576.0 }, { "name": "l1tex__data_pipe_lsu_wavefronts_mem_shared.avg.pct_of_peak_sustained_elapsed", "type": "Double", "unit": "", - "value": 0.5597260471696622 + "value": 0.02069226539689442 }, { "name": "l1tex__data_pipe_lsu_wavefronts_mem_shared_op_ld.sum", @@ -24720,13 +25815,13 @@ "name": "sm__warps_active.avg.pct_of_peak_sustained_active", "type": "Double", "unit": "", - "value": 65.76308105356907 + "value": 81.83386293428686 }, { "name": "smsp__cycles_active.avg.pct_of_peak_sustained_elapsed", "type": "Double", "unit": "", - "value": 99.95129837778576 + "value": 99.97391975530049 }, { "name": "smsp__sass_thread_inst_executed_op_fp16_pred_on.sum", @@ -24738,7 +25833,7 @@ "name": "smsp__sass_thread_inst_executed_op_fp32_pred_on.sum", "type": "Double", "unit": "", - "value": 8606711808.0 + "value": 4462739456.0 }, { "name": "smsp__sass_thread_inst_executed_op_fp64_pred_on.sum", @@ -24750,43 +25845,43 @@ "name": "smsp__sass_thread_inst_executed_op_integer_pred_on.sum", "type": "Double", "unit": "", - "value": 2952790016.0 + "value": 5813305344.0 }, { "name": "smsp__sass_thread_inst_executed_op_control_pred_on.sum", "type": "Double", "unit": "", - "value": 285212672.0 + "value": 101187584.0 }, { "name": "smsp__sass_thread_inst_executed_op_memory_pred_on.sum", "type": "Double", "unit": "", - "value": 4328521728.0 + "value": 50331648.0 }, { "name": "smsp__sass_thread_inst_executed_op_misc_pred_on.sum", "type": "Double", "unit": "", - "value": 318767104.0 + "value": 986185728.0 }, { "name": "smsp__sass_thread_inst_executed_op_conversion_pred_on.sum", "type": "Double", "unit": "", - "value": 50331648.0 + "value": 1572864.0 }, { "name": "smsp__inst_executed.sum", "type": "Double", "unit": "", - "value": 529006592.0 + "value": 897892352.0 }, { "name": "smsp__pipe_fma_cycles_active.avg.pct_of_peak_sustained_active", "type": "Double", "unit": "", - "value": 21.32509940290335 + "value": 44.0100996023888 }, { "name": "smsp__inst_executed_pipe_fp64.avg.pct_of_peak_sustained_active", @@ -24798,13 +25893,13 @@ "name": "smsp__inst_executed_pipe_xu.avg.pct_of_peak_sustained_active", "type": "Double", "unit": "", - "value": 23.893281094732266 + "value": 28.259209759615306 }, { "name": "smsp__inst_executed_pipe_lsu.avg.pct_of_peak_sustained_active", "type": "Double", "unit": "", - "value": 24.359946741113756 + "value": 0.3415114460695698 }, { "name": "smsp__inst_executed_pipe_tex.avg.pct_of_peak_sustained_active", @@ -24816,7 +25911,7 @@ "name": "smsp__issue_active.avg.pct_of_peak_sustained_active", "type": "Double", "unit": "", - "value": 47.0869494866677 + "value": 94.63713159255578 }, { "name": "smsp__thread_inst_executed_per_inst_executed.ratio", @@ -24835,30 +25930,47 @@ "time" ], "times": { - "compilation": 22764.635, - "data": 62352.33, - "framework": 520303.485, - "kernel_overhead": 178910.441, - "profiling_overhead": 51921.09, - "profiling_runs": 227119.624, + "compilation_time": 15155.985, + "data": 58026.427, + "framework": 3482989.927, + "kernel_overhead": 1668238.879, + "profiling_overhead": 47743.108, + "profiling_runs": 1708981.513, "runtimes": [ - 7035.136 + 5878.688 ], - "search_algorithm": 29.74, - "validation": 18.091 + "search_algorithm": 32.325, + "validation": 15.167 }, - "timestamp": "2026-01-27 09:26:13 UTC" + "timestamp": "2026-03-13 09:40:45 UTC" }, { + "compilation_data": { + "constant_memory_size": 0, + "global_size": { + "x": 16, + "y": 64, + "z": 256 + }, + "local_memory_size": 0, + "local_size": { + "x": 16, + "y": 4, + "z": 1 + }, + "max_work_group_size": 1024, + "private_memory_size": 0, + "registers": 38 + }, "configuration": { - "INNER_UNROLL_FACTOR": "0", - "USE_CONSTANT_MEMORY": "0", - "USE_SOA": "1", - "VECTOR_SIZE": "1", - "WORK_GROUP_SIZE_X": "32", - "WORK_GROUP_SIZE_Y": "4", - "WORK_GROUP_SIZE_Z": "1", - "Z_ITERATIONS": "1" + "INNER_UNROLL_FACTOR": 0, + "USE_CONSTANT_MEMORY": 0, + "USE_SOA": 1, + "VECTOR_SIZE": 1, + "WORK_GROUP_SIZE_X": 16, + "WORK_GROUP_SIZE_Y": 4, + "WORK_GROUP_SIZE_Z": 1, + "Z_ITERATIONS": 1 }, "correctness": 1, "invalidity": "correct", @@ -24866,61 +25978,61 @@ { "name": "time", "unit": "", - "value": 7026.4 + "value": 6956.864 }, { "name": "dram__throughput.avg.pct_of_peak_sustained_elapsed", "type": "Double", "unit": "", - "value": 14.814499649416014 + "value": 7.091559289527958 }, { "name": "dram__sectors_read.sum", "type": "Double", "unit": "", - "value": 2109348.0 + "value": 9992.0 }, { "name": "dram__sectors_write.sum", "type": "Double", "unit": "", - "value": 1839112.0 + "value": 1873744.0 }, { "name": "lts__t_sectors.avg.pct_of_peak_sustained_elapsed", "type": "Double", "unit": "", - "value": 1.7798866479232682 + "value": 0.9272932330576963 }, { "name": "lts__t_sectors_op_read.sum", "type": "Double", "unit": "", - "value": 2223148.0 + "value": 124361.0 }, { "name": "lts__t_sectors_op_write.sum", "type": "Double", "unit": "", - "value": 2100906.0 + "value": 2105209.0 }, { "name": "l1tex__t_requests_pipe_lsu_mem_global_op_ld.avg.pct_of_peak_sustained_active", "type": "Double", "unit": "", - "value": 48.003260781293925 + "value": 47.985931610952925 }, { "name": "l1tex__t_requests_pipe_lsu_mem_global_op_ld.sum", "type": "Double", "unit": "", - "value": 134742016.0 + "value": 134217728.0 }, { "name": "l1tex__data_pipe_lsu_wavefronts_mem_shared.avg.pct_of_peak_sustained_elapsed", "type": "Double", "unit": "", - "value": 0.5603076699208502 + "value": 0.5621981774882934 }, { "name": "l1tex__data_pipe_lsu_wavefronts_mem_shared_op_ld.sum", @@ -24950,13 +26062,13 @@ "name": "sm__warps_active.avg.pct_of_peak_sustained_active", "type": "Double", "unit": "", - "value": 86.18955652832886 + "value": 65.02050021532719 }, { "name": "smsp__cycles_active.avg.pct_of_peak_sustained_elapsed", "type": "Double", "unit": "", - "value": 99.98279995268072 + "value": 99.95307455416726 }, { "name": "smsp__sass_thread_inst_executed_op_fp16_pred_on.sum", @@ -24968,7 +26080,7 @@ "name": "smsp__sass_thread_inst_executed_op_fp32_pred_on.sum", "type": "Double", "unit": "", - "value": 8606711808.0 + "value": 8589934592.0 }, { "name": "smsp__sass_thread_inst_executed_op_fp64_pred_on.sum", @@ -24992,7 +26104,7 @@ "name": "smsp__sass_thread_inst_executed_op_memory_pred_on.sum", "type": "Double", "unit": "", - "value": 4328521728.0 + "value": 4311744512.0 }, { "name": "smsp__sass_thread_inst_executed_op_misc_pred_on.sum", @@ -25010,13 +26122,13 @@ "name": "smsp__inst_executed.sum", "type": "Double", "unit": "", - "value": 529006592.0 + "value": 527958016.0 }, { "name": "smsp__pipe_fma_cycles_active.avg.pct_of_peak_sustained_active", "type": "Double", "unit": "", - "value": 21.342378478743107 + "value": 21.373269204175543 }, { "name": "smsp__inst_executed_pipe_fp64.avg.pct_of_peak_sustained_active", @@ -25028,13 +26140,13 @@ "name": "smsp__inst_executed_pipe_xu.avg.pct_of_peak_sustained_active", "type": "Double", "unit": "", - "value": 23.91057321319659 + "value": 23.998383587991835 }, { "name": "smsp__inst_executed_pipe_lsu.avg.pct_of_peak_sustained_active", "type": "Double", "unit": "", - "value": 24.377576596266838 + "value": 24.37335833155421 }, { "name": "smsp__inst_executed_pipe_tex.avg.pct_of_peak_sustained_active", @@ -25046,7 +26158,7 @@ "name": "smsp__issue_active.avg.pct_of_peak_sustained_active", "type": "Double", "unit": "", - "value": 47.12104610325507 + "value": 47.20033563403627 }, { "name": "smsp__thread_inst_executed_per_inst_executed.ratio", @@ -25065,30 +26177,47 @@ "time" ], "times": { - "compilation": 22548.53, - "data": 61242.125, - "framework": 515661.415, - "kernel_overhead": 177552.24, - "profiling_overhead": 50823.182, - "profiling_runs": 226043.868, + "compilation_time": 13999.331, + "data": 58876.247, + "framework": 497764.01600000006, + "kernel_overhead": 171167.444, + "profiling_overhead": 48697.512, + "profiling_runs": 219022.813, "runtimes": [ - 7026.4 + 6956.864 ], - "search_algorithm": 33.434, - "validation": 20.103 + "search_algorithm": 24.447, + "validation": 17.109 }, - "timestamp": "2026-01-27 09:26:13 UTC" + "timestamp": "2026-03-13 09:40:45 UTC" }, { + "compilation_data": { + "constant_memory_size": 0, + "global_size": { + "x": 16, + "y": 64, + "z": 128 + }, + "local_memory_size": 0, + "local_size": { + "x": 16, + "y": 4, + "z": 1 + }, + "max_work_group_size": 1024, + "private_memory_size": 0, + "registers": 39 + }, "configuration": { - "INNER_UNROLL_FACTOR": "0", - "USE_CONSTANT_MEMORY": "0", - "USE_SOA": "1", - "VECTOR_SIZE": "1", - "WORK_GROUP_SIZE_X": "32", - "WORK_GROUP_SIZE_Y": "8", - "WORK_GROUP_SIZE_Z": "1", - "Z_ITERATIONS": "1" + "INNER_UNROLL_FACTOR": 0, + "USE_CONSTANT_MEMORY": 0, + "USE_SOA": 1, + "VECTOR_SIZE": 1, + "WORK_GROUP_SIZE_X": 16, + "WORK_GROUP_SIZE_Y": 4, + "WORK_GROUP_SIZE_Z": 1, + "Z_ITERATIONS": 2 }, "correctness": 1, "invalidity": "correct", @@ -25096,61 +26225,61 @@ { "name": "time", "unit": "", - "value": 6908.288 + "value": 3690.112 }, { "name": "dram__throughput.avg.pct_of_peak_sustained_elapsed", "type": "Double", "unit": "", - "value": 14.736864546261222 + "value": 13.177031065194797 }, { "name": "dram__sectors_read.sum", "type": "Double", "unit": "", - "value": 2110940.0 + "value": 9112.0 }, { "name": "dram__sectors_write.sum", "type": "Double", "unit": "", - "value": 1843048.0 + "value": 1873268.0 }, { "name": "lts__t_sectors.avg.pct_of_peak_sustained_elapsed", "type": "Double", "unit": "", - "value": 1.786935671223272 + "value": 1.7178596261215822 }, { "name": "lts__t_sectors_op_read.sum", "type": "Double", "unit": "", - "value": 2227946.0 + "value": 74101.0 }, { "name": "lts__t_sectors_op_write.sum", "type": "Double", "unit": "", - "value": 2104044.0 + "value": 2102140.0 }, { "name": "l1tex__t_requests_pipe_lsu_mem_global_op_ld.avg.pct_of_peak_sustained_active", "type": "Double", "unit": "", - "value": 48.258697936058596 + "value": 45.688046324691534 }, { "name": "l1tex__t_requests_pipe_lsu_mem_global_op_ld.sum", "type": "Double", "unit": "", - "value": 134742016.0 + "value": 67108864.0 }, { "name": "l1tex__data_pipe_lsu_wavefronts_mem_shared.avg.pct_of_peak_sustained_elapsed", "type": "Double", "unit": "", - "value": 0.5631868985714454 + "value": 0.5352173087134678 }, { "name": "l1tex__data_pipe_lsu_wavefronts_mem_shared_op_ld.sum", @@ -25180,13 +26309,13 @@ "name": "sm__warps_active.avg.pct_of_peak_sustained_active", "type": "Double", "unit": "", - "value": 92.46661833729829 + "value": 73.13309305457548 }, { "name": "smsp__cycles_active.avg.pct_of_peak_sustained_elapsed", "type": "Double", "unit": "", - "value": 99.96855575264703 + "value": 99.89190925234382 }, { "name": "smsp__sass_thread_inst_executed_op_fp16_pred_on.sum", @@ -25198,7 +26327,7 @@ "name": "smsp__sass_thread_inst_executed_op_fp32_pred_on.sum", "type": "Double", "unit": "", - "value": 8606711808.0 + "value": 6442450944.0 }, { "name": "smsp__sass_thread_inst_executed_op_fp64_pred_on.sum", @@ -25210,43 +26339,43 @@ "name": "smsp__sass_thread_inst_executed_op_integer_pred_on.sum", "type": "Double", "unit": "", - "value": 2952790016.0 + "value": 1509949440.0 }, { "name": "smsp__sass_thread_inst_executed_op_control_pred_on.sum", "type": "Double", "unit": "", - "value": 285212672.0 + "value": 142606336.0 }, { "name": "smsp__sass_thread_inst_executed_op_memory_pred_on.sum", "type": "Double", "unit": "", - "value": 4328521728.0 + "value": 2164260864.0 }, { "name": "smsp__sass_thread_inst_executed_op_misc_pred_on.sum", "type": "Double", "unit": "", - "value": 318767104.0 + "value": 184549376.0 }, { "name": "smsp__sass_thread_inst_executed_op_conversion_pred_on.sum", "type": "Double", "unit": "", - "value": 50331648.0 + "value": 25165824.0 }, { "name": "smsp__inst_executed.sum", "type": "Double", "unit": "", - "value": 529006592.0 + "value": 333971456.0 }, { "name": "smsp__pipe_fma_cycles_active.avg.pct_of_peak_sustained_active", "type": "Double", "unit": "", - "value": 21.454862972783815 + "value": 28.97735510171988 }, { "name": "smsp__inst_executed_pipe_fp64.avg.pct_of_peak_sustained_active", @@ -25258,13 +26387,13 @@ "name": "smsp__inst_executed_pipe_xu.avg.pct_of_peak_sustained_active", "type": "Double", "unit": "", - "value": 24.03686588394612 + "value": 45.72129750252451 }, { "name": "smsp__inst_executed_pipe_lsu.avg.pct_of_peak_sustained_active", "type": "Double", "unit": "", - "value": 24.506335920741947 + "value": 23.307145797185342 }, { "name": "smsp__inst_executed_pipe_tex.avg.pct_of_peak_sustained_active", @@ -25276,7 +26405,7 @@ "name": "smsp__issue_active.avg.pct_of_peak_sustained_active", "type": "Double", "unit": "", - "value": 47.36997300150607 + "value": 56.88429270678098 }, { "name": "smsp__thread_inst_executed_per_inst_executed.ratio", @@ -25295,30 +26424,47 @@ "time" ], "times": { - "compilation": 22449.403, - "data": 61344.614, - "framework": 512588.283, - "kernel_overhead": 176204.886, - "profiling_overhead": 51150.698, - "profiling_runs": 223888.085, + "compilation_time": 15310.708, + "data": 57917.926, + "framework": 319668.113, + "kernel_overhead": 91985.807, + "profiling_overhead": 47552.856, + "profiling_runs": 122211.524, "runtimes": [ - 6908.288 + 3690.112 ], - "search_algorithm": 36.889, - "validation": 16.793 + "search_algorithm": 34.395, + "validation": 18.469 }, - "timestamp": "2026-01-27 09:26:13 UTC" + "timestamp": "2026-03-13 09:40:45 UTC" }, { + "compilation_data": { + "constant_memory_size": 0, + "global_size": { + "x": 16, + "y": 64, + "z": 64 + }, + "local_memory_size": 0, + "local_size": { + "x": 16, + "y": 4, + "z": 1 + }, + "max_work_group_size": 1024, + "private_memory_size": 0, + "registers": 40 + }, "configuration": { - "INNER_UNROLL_FACTOR": "0", - "USE_CONSTANT_MEMORY": "0", - "USE_SOA": "1", - "VECTOR_SIZE": "1", - "WORK_GROUP_SIZE_X": "16", - "WORK_GROUP_SIZE_Y": "4", - "WORK_GROUP_SIZE_Z": "1", - "Z_ITERATIONS": "2" + "INNER_UNROLL_FACTOR": 0, + "USE_CONSTANT_MEMORY": 0, + "USE_SOA": 1, + "VECTOR_SIZE": 1, + "WORK_GROUP_SIZE_X": 16, + "WORK_GROUP_SIZE_Y": 4, + "WORK_GROUP_SIZE_Z": 1, + "Z_ITERATIONS": 4 }, "correctness": 1, "invalidity": "correct", @@ -25326,61 +26472,61 @@ { "name": "time", "unit": "", - "value": 3715.424 + "value": 2324.032 }, { "name": "dram__throughput.avg.pct_of_peak_sustained_elapsed", "type": "Double", "unit": "", - "value": 28.296723002513936 + "value": 20.908456951094394 }, { "name": "dram__sectors_read.sum", "type": "Double", "unit": "", - "value": 2106580.0 + "value": 5884.0 }, { "name": "dram__sectors_write.sum", "type": "Double", "unit": "", - "value": 1869920.0 + "value": 1872224.0 }, { "name": "lts__t_sectors.avg.pct_of_peak_sustained_elapsed", "type": "Double", "unit": "", - "value": 3.322192565968719 + "value": 2.8008081369040703 }, { "name": "lts__t_sectors_op_read.sum", "type": "Double", "unit": "", - "value": 2154956.0 + "value": 46697.0 }, { "name": "lts__t_sectors_op_write.sum", "type": "Double", "unit": "", - "value": 2100456.0 + "value": 2101073.0 }, { "name": "l1tex__t_requests_pipe_lsu_mem_global_op_ld.avg.pct_of_peak_sustained_active", "type": "Double", "unit": "", - "value": 45.71266666468405 + "value": 37.927957927795156 }, { "name": "l1tex__t_requests_pipe_lsu_mem_global_op_ld.sum", "type": "Double", "unit": "", - "value": 67633152.0 + "value": 33554432.0 }, { "name": "l1tex__data_pipe_lsu_wavefronts_mem_shared.avg.pct_of_peak_sustained_elapsed", "type": "Double", "unit": "", - "value": 0.5313440506238389 + "value": 0.4442090689069919 }, { "name": "l1tex__data_pipe_lsu_wavefronts_mem_shared_op_ld.sum", @@ -25410,13 +26556,13 @@ "name": "sm__warps_active.avg.pct_of_peak_sustained_active", "type": "Double", "unit": "", - "value": 78.89500753202125 + "value": 96.26798087956222 }, { "name": "smsp__cycles_active.avg.pct_of_peak_sustained_elapsed", "type": "Double", "unit": "", - "value": 99.9167036894262 + "value": 99.92508598010313 }, { "name": "smsp__sass_thread_inst_executed_op_fp16_pred_on.sum", @@ -25428,7 +26574,7 @@ "name": "smsp__sass_thread_inst_executed_op_fp32_pred_on.sum", "type": "Double", "unit": "", - "value": 6459228160.0 + "value": 5368709120.0 }, { "name": "smsp__sass_thread_inst_executed_op_fp64_pred_on.sum", @@ -25440,43 +26586,43 @@ "name": "smsp__sass_thread_inst_executed_op_integer_pred_on.sum", "type": "Double", "unit": "", - "value": 1509949440.0 + "value": 591396864.0 }, { "name": "smsp__sass_thread_inst_executed_op_control_pred_on.sum", "type": "Double", "unit": "", - "value": 142606336.0 + "value": 71303168.0 }, { "name": "smsp__sass_thread_inst_executed_op_memory_pred_on.sum", "type": "Double", "unit": "", - "value": 2181038080.0 + "value": 1090519040.0 }, { "name": "smsp__sass_thread_inst_executed_op_misc_pred_on.sum", "type": "Double", "unit": "", - "value": 184549376.0 + "value": 146800640.0 }, { "name": "smsp__sass_thread_inst_executed_op_conversion_pred_on.sum", "type": "Double", "unit": "", - "value": 25165824.0 + "value": 12582912.0 }, { "name": "smsp__inst_executed.sum", "type": "Double", "unit": "", - "value": 335020032.0 + "value": 228327424.0 }, { "name": "smsp__pipe_fma_cycles_active.avg.pct_of_peak_sustained_active", "type": "Double", "unit": "", - "value": 28.849465048786982 + "value": 43.23194693631487 }, { "name": "smsp__inst_executed_pipe_fp64.avg.pct_of_peak_sustained_active", @@ -25488,13 +26634,13 @@ "name": "smsp__inst_executed_pipe_xu.avg.pct_of_peak_sustained_active", "type": "Double", "unit": "", - "value": 45.37915815107689 + "value": 75.86851724956774 }, { "name": "smsp__inst_executed_pipe_lsu.avg.pct_of_peak_sustained_active", "type": "Double", "unit": "", - "value": 23.309997253385202 + "value": 19.485761754527655 }, { "name": "smsp__inst_executed_pipe_tex.avg.pct_of_peak_sustained_active", @@ -25506,7 +26652,7 @@ "name": "smsp__issue_active.avg.pct_of_peak_sustained_active", "type": "Double", "unit": "", - "value": 56.635887066146736 + "value": 64.53372491391114 }, { "name": "smsp__thread_inst_executed_per_inst_executed.ratio", @@ -25525,30 +26671,47 @@ "time" ], "times": { - "compilation": 22823.578, - "data": 61027.107, - "framework": 333238.067, - "kernel_overhead": 95353.359, - "profiling_overhead": 50751.99, - "profiling_runs": 126105.611, + "compilation_time": 14840.252, + "data": 57249.005, + "framework": 190940.385, + "kernel_overhead": 31589.186, + "profiling_overhead": 47343.124, + "profiling_runs": 54759.07, "runtimes": [ - 3715.424 + 2324.032 ], - "search_algorithm": 38.825, - "validation": 16.983 + "search_algorithm": 33.301, + "validation": 16.524 }, - "timestamp": "2026-01-27 09:26:14 UTC" + "timestamp": "2026-03-13 09:40:46 UTC" }, { + "compilation_data": { + "constant_memory_size": 0, + "global_size": { + "x": 16, + "y": 64, + "z": 32 + }, + "local_memory_size": 0, + "local_size": { + "x": 16, + "y": 4, + "z": 1 + }, + "max_work_group_size": 1024, + "private_memory_size": 0, + "registers": 40 + }, "configuration": { - "INNER_UNROLL_FACTOR": "0", - "USE_CONSTANT_MEMORY": "0", - "USE_SOA": "1", - "VECTOR_SIZE": "1", - "WORK_GROUP_SIZE_X": "16", - "WORK_GROUP_SIZE_Y": "8", - "WORK_GROUP_SIZE_Z": "1", - "Z_ITERATIONS": "2" + "INNER_UNROLL_FACTOR": 0, + "USE_CONSTANT_MEMORY": 0, + "USE_SOA": 1, + "VECTOR_SIZE": 1, + "WORK_GROUP_SIZE_X": 16, + "WORK_GROUP_SIZE_Y": 4, + "WORK_GROUP_SIZE_Z": 1, + "Z_ITERATIONS": 8 }, "correctness": 1, "invalidity": "correct", @@ -25556,61 +26719,61 @@ { "name": "time", "unit": "", - "value": 3689.728 + "value": 2008.544 }, { "name": "dram__throughput.avg.pct_of_peak_sustained_elapsed", "type": "Double", "unit": "", - "value": 28.246607855827605 + "value": 24.971648185483872 }, { "name": "dram__sectors_read.sum", "type": "Double", "unit": "", - "value": 2097792.0 + "value": 488.0 }, { "name": "dram__sectors_write.sum", "type": "Double", "unit": "", - "value": 1868556.0 + "value": 1870284.0 }, { "name": "lts__t_sectors.avg.pct_of_peak_sustained_elapsed", "type": "Double", "unit": "", - "value": 3.3196487064455327 + "value": 3.1572367538532986 }, { "name": "lts__t_sectors_op_read.sum", "type": "Double", "unit": "", - "value": 2149422.0 + "value": 35634.0 }, { "name": "lts__t_sectors_op_write.sum", "type": "Double", "unit": "", - "value": 2098982.0 + "value": 2099304.0 }, { "name": "l1tex__t_requests_pipe_lsu_mem_global_op_ld.avg.pct_of_peak_sustained_active", "type": "Double", "unit": "", - "value": 45.746751146194235 + "value": 21.53325312940483 }, { "name": "l1tex__t_requests_pipe_lsu_mem_global_op_ld.sum", "type": "Double", "unit": "", - "value": 67633152.0 + "value": 16777216.0 }, { "name": "l1tex__data_pipe_lsu_wavefronts_mem_shared.avg.pct_of_peak_sustained_elapsed", "type": "Double", "unit": "", - "value": 0.5317641501794482 + "value": 0.2521377487123544 }, { "name": "l1tex__data_pipe_lsu_wavefronts_mem_shared_op_ld.sum", @@ -25640,13 +26803,13 @@ "name": "sm__warps_active.avg.pct_of_peak_sustained_active", "type": "Double", "unit": "", - "value": 93.8569839203346 + "value": 98.14832081805851 }, { "name": "smsp__cycles_active.avg.pct_of_peak_sustained_elapsed", "type": "Double", "unit": "", - "value": 99.95462337673156 + "value": 99.90158339397559 }, { "name": "smsp__sass_thread_inst_executed_op_fp16_pred_on.sum", @@ -25658,7 +26821,7 @@ "name": "smsp__sass_thread_inst_executed_op_fp32_pred_on.sum", "type": "Double", "unit": "", - "value": 6459228160.0 + "value": 4831838208.0 }, { "name": "smsp__sass_thread_inst_executed_op_fp64_pred_on.sum", @@ -25670,25 +26833,766 @@ "name": "smsp__sass_thread_inst_executed_op_integer_pred_on.sum", "type": "Double", "unit": "", - "value": 1509949440.0 + "value": 564133888.0 }, { "name": "smsp__sass_thread_inst_executed_op_control_pred_on.sum", "type": "Double", "unit": "", - "value": 142606336.0 + "value": 69206016.0 }, { "name": "smsp__sass_thread_inst_executed_op_memory_pred_on.sum", "type": "Double", "unit": "", - "value": 2181038080.0 + "value": 553648128.0 }, { "name": "smsp__sass_thread_inst_executed_op_misc_pred_on.sum", "type": "Double", "unit": "", - "value": 184549376.0 + "value": 115343360.0 + }, + { + "name": "smsp__sass_thread_inst_executed_op_conversion_pred_on.sum", + "type": "Double", + "unit": "", + "value": 6291456.0 + }, + { + "name": "smsp__inst_executed.sum", + "type": "Double", + "unit": "", + "value": 192282624.0 + }, + { + "name": "smsp__pipe_fma_cycles_active.avg.pct_of_peak_sustained_active", + "type": "Double", + "unit": "", + "value": 43.620422469694994 + }, + { + "name": "smsp__inst_executed_pipe_fp64.avg.pct_of_peak_sustained_active", + "type": "Double", + "unit": "", + "value": 0.0 + }, + { + "name": "smsp__inst_executed_pipe_xu.avg.pct_of_peak_sustained_active", + "type": "Double", + "unit": "", + "value": 86.14780197001382 + }, + { + "name": "smsp__inst_executed_pipe_lsu.avg.pct_of_peak_sustained_active", + "type": "Double", + "unit": "", + "value": 11.231183166989107 + }, + { + "name": "smsp__inst_executed_pipe_tex.avg.pct_of_peak_sustained_active", + "type": "Double", + "unit": "", + "value": 0.0 + }, + { + "name": "smsp__issue_active.avg.pct_of_peak_sustained_active", + "type": "Double", + "unit": "", + "value": 61.70953283958999 + }, + { + "name": "smsp__thread_inst_executed_per_inst_executed.ratio", + "type": "Double", + "unit": "", + "value": 32.0 + }, + { + "name": "smsp__thread_inst_executed_per_inst_executed.pct", + "type": "Double", + "unit": "", + "value": 100.0 + } + ], + "objectives": [ + "time" + ], + "times": { + "compilation_time": 14767.924, + "data": 57261.038, + "framework": 186393.05, + "kernel_overhead": 30138.221, + "profiling_overhead": 47270.288, + "profiling_runs": 51723.503, + "runtimes": [ + 2008.544 + ], + "search_algorithm": 26.089, + "validation": 15.662 + }, + "timestamp": "2026-03-13 09:40:46 UTC" + }, + { + "compilation_data": { + "constant_memory_size": 0, + "global_size": { + "x": 16, + "y": 64, + "z": 16 + }, + "local_memory_size": 0, + "local_size": { + "x": 16, + "y": 4, + "z": 1 + }, + "max_work_group_size": 1024, + "private_memory_size": 0, + "registers": 40 + }, + "configuration": { + "INNER_UNROLL_FACTOR": 0, + "USE_CONSTANT_MEMORY": 0, + "USE_SOA": 1, + "VECTOR_SIZE": 1, + "WORK_GROUP_SIZE_X": 16, + "WORK_GROUP_SIZE_Y": 4, + "WORK_GROUP_SIZE_Z": 1, + "Z_ITERATIONS": 16 + }, + "correctness": 1, + "invalidity": "correct", + "measurements": [ + { + "name": "time", + "unit": "", + "value": 1885.536 + }, + { + "name": "dram__throughput.avg.pct_of_peak_sustained_elapsed", + "type": "Double", + "unit": "", + "value": 26.541428882992257 + }, + { + "name": "dram__sectors_read.sum", + "type": "Double", + "unit": "", + "value": 216.0 + }, + { + "name": "dram__sectors_write.sum", + "type": "Double", + "unit": "", + "value": 1869048.0 + }, + { + "name": "lts__t_sectors.avg.pct_of_peak_sustained_elapsed", + "type": "Double", + "unit": "", + "value": 3.3561356533942335 + }, + { + "name": "lts__t_sectors_op_read.sum", + "type": "Double", + "unit": "", + "value": 31256.0 + }, + { + "name": "lts__t_sectors_op_write.sum", + "type": "Double", + "unit": "", + "value": 2098991.0 + }, + { + "name": "l1tex__t_requests_pipe_lsu_mem_global_op_ld.avg.pct_of_peak_sustained_active", + "type": "Double", + "unit": "", + "value": 11.450177538596641 + }, + { + "name": "l1tex__t_requests_pipe_lsu_mem_global_op_ld.sum", + "type": "Double", + "unit": "", + "value": 8388608.0 + }, + { + "name": "l1tex__data_pipe_lsu_wavefronts_mem_shared.avg.pct_of_peak_sustained_elapsed", + "type": "Double", + "unit": "", + "value": 0.13410566321405698 + }, + { + "name": "l1tex__data_pipe_lsu_wavefronts_mem_shared_op_ld.sum", + "type": "Double", + "unit": "", + "value": 0.0 + }, + { + "name": "l1tex__data_pipe_lsu_wavefronts_mem_shared_op_st.sum", + "type": "Double", + "unit": "", + "value": 0.0 + }, + { + "name": "l1tex__t_sectors_pipe_lsu_mem_local_op_ld.sum", + "type": "Double", + "unit": "", + "value": 0.0 + }, + { + "name": "l1tex__t_sectors_pipe_lsu_mem_local_op_st.sum", + "type": "Double", + "unit": "", + "value": 0.0 + }, + { + "name": "sm__warps_active.avg.pct_of_peak_sustained_active", + "type": "Double", + "unit": "", + "value": 98.29599356027428 + }, + { + "name": "smsp__cycles_active.avg.pct_of_peak_sustained_elapsed", + "type": "Double", + "unit": "", + "value": 99.92646008795883 + }, + { + "name": "smsp__sass_thread_inst_executed_op_fp16_pred_on.sum", + "type": "Double", + "unit": "", + "value": 0.0 + }, + { + "name": "smsp__sass_thread_inst_executed_op_fp32_pred_on.sum", + "type": "Double", + "unit": "", + "value": 4563402752.0 + }, + { + "name": "smsp__sass_thread_inst_executed_op_fp64_pred_on.sum", + "type": "Double", + "unit": "", + "value": 0.0 + }, + { + "name": "smsp__sass_thread_inst_executed_op_integer_pred_on.sum", + "type": "Double", + "unit": "", + "value": 209715200.0 + }, + { + "name": "smsp__sass_thread_inst_executed_op_control_pred_on.sum", + "type": "Double", + "unit": "", + "value": 67108864.0 + }, + { + "name": "smsp__sass_thread_inst_executed_op_memory_pred_on.sum", + "type": "Double", + "unit": "", + "value": 285212672.0 + }, + { + "name": "smsp__sass_thread_inst_executed_op_misc_pred_on.sum", + "type": "Double", + "unit": "", + "value": 558891008.0 + }, + { + "name": "smsp__sass_thread_inst_executed_op_conversion_pred_on.sum", + "type": "Double", + "unit": "", + "value": 3145728.0 + }, + { + "name": "smsp__inst_executed.sum", + "type": "Double", + "unit": "", + "value": 194805760.0 + }, + { + "name": "smsp__pipe_fma_cycles_active.avg.pct_of_peak_sustained_active", + "type": "Double", + "unit": "", + "value": 37.779530933833634 + }, + { + "name": "smsp__inst_executed_pipe_fp64.avg.pct_of_peak_sustained_active", + "type": "Double", + "unit": "", + "value": 0.0 + }, + { + "name": "smsp__inst_executed_pipe_xu.avg.pct_of_peak_sustained_active", + "type": "Double", + "unit": "", + "value": 91.61684103177258 + }, + { + "name": "smsp__inst_executed_pipe_lsu.avg.pct_of_peak_sustained_active", + "type": "Double", + "unit": "", + "value": 6.151033028256216 + }, + { + "name": "smsp__inst_executed_pipe_tex.avg.pct_of_peak_sustained_active", + "type": "Double", + "unit": "", + "value": 0.0 + }, + { + "name": "smsp__issue_active.avg.pct_of_peak_sustained_active", + "type": "Double", + "unit": "", + "value": 66.48880625743924 + }, + { + "name": "smsp__thread_inst_executed_per_inst_executed.ratio", + "type": "Double", + "unit": "", + "value": 32.0 + }, + { + "name": "smsp__thread_inst_executed_per_inst_executed.pct", + "type": "Double", + "unit": "", + "value": 100.0 + } + ], + "objectives": [ + "time" + ], + "times": { + "compilation_time": 15027.008, + "data": 57492.809, + "framework": 194400.51799999998, + "kernel_overhead": 34031.83, + "profiling_overhead": 47886.766, + "profiling_runs": 54989.113, + "runtimes": [ + 1885.536 + ], + "search_algorithm": 20.941, + "validation": 15.338 + }, + "timestamp": "2026-03-13 09:40:46 UTC" + }, + { + "compilation_data": { + "constant_memory_size": 0, + "global_size": { + "x": 16, + "y": 64, + "z": 8 + }, + "local_memory_size": 0, + "local_size": { + "x": 16, + "y": 4, + "z": 1 + }, + "max_work_group_size": 1024, + "private_memory_size": 0, + "registers": 48 + }, + "configuration": { + "INNER_UNROLL_FACTOR": 0, + "USE_CONSTANT_MEMORY": 0, + "USE_SOA": 1, + "VECTOR_SIZE": 1, + "WORK_GROUP_SIZE_X": 16, + "WORK_GROUP_SIZE_Y": 4, + "WORK_GROUP_SIZE_Z": 1, + "Z_ITERATIONS": 32 + }, + "correctness": 1, + "invalidity": "correct", + "measurements": [ + { + "name": "time", + "unit": "", + "value": 1784.352 + }, + { + "name": "dram__throughput.avg.pct_of_peak_sustained_elapsed", + "type": "Double", + "unit": "", + "value": 27.78061540366518 + }, + { + "name": "dram__sectors_read.sum", + "type": "Double", + "unit": "", + "value": 532.0 + }, + { + "name": "dram__sectors_write.sum", + "type": "Double", + "unit": "", + "value": 1866112.0 + }, + { + "name": "lts__t_sectors.avg.pct_of_peak_sustained_elapsed", + "type": "Double", + "unit": "", + "value": 3.4999918462164867 + }, + { + "name": "lts__t_sectors_op_read.sum", + "type": "Double", + "unit": "", + "value": 29154.0 + }, + { + "name": "lts__t_sectors_op_write.sum", + "type": "Double", + "unit": "", + "value": 2099634.0 + }, + { + "name": "l1tex__t_requests_pipe_lsu_mem_global_op_ld.avg.pct_of_peak_sustained_active", + "type": "Double", + "unit": "", + "value": 5.930354155752191 + }, + { + "name": "l1tex__t_requests_pipe_lsu_mem_global_op_ld.sum", + "type": "Double", + "unit": "", + "value": 4194304.0 + }, + { + "name": "l1tex__data_pipe_lsu_wavefronts_mem_shared.avg.pct_of_peak_sustained_elapsed", + "type": "Double", + "unit": "", + "value": 0.0694532209270332 + }, + { + "name": "l1tex__data_pipe_lsu_wavefronts_mem_shared_op_ld.sum", + "type": "Double", + "unit": "", + "value": 0.0 + }, + { + "name": "l1tex__data_pipe_lsu_wavefronts_mem_shared_op_st.sum", + "type": "Double", + "unit": "", + "value": 0.0 + }, + { + "name": "l1tex__t_sectors_pipe_lsu_mem_local_op_ld.sum", + "type": "Double", + "unit": "", + "value": 0.0 + }, + { + "name": "l1tex__t_sectors_pipe_lsu_mem_local_op_st.sum", + "type": "Double", + "unit": "", + "value": 0.0 + }, + { + "name": "sm__warps_active.avg.pct_of_peak_sustained_active", + "type": "Double", + "unit": "", + "value": 81.7165734520455 + }, + { + "name": "smsp__cycles_active.avg.pct_of_peak_sustained_elapsed", + "type": "Double", + "unit": "", + "value": 99.92027405535595 + }, + { + "name": "smsp__sass_thread_inst_executed_op_fp16_pred_on.sum", + "type": "Double", + "unit": "", + "value": 0.0 + }, + { + "name": "smsp__sass_thread_inst_executed_op_fp32_pred_on.sum", + "type": "Double", + "unit": "", + "value": 4429185024.0 + }, + { + "name": "smsp__sass_thread_inst_executed_op_fp64_pred_on.sum", + "type": "Double", + "unit": "", + "value": 0.0 + }, + { + "name": "smsp__sass_thread_inst_executed_op_integer_pred_on.sum", + "type": "Double", + "unit": "", + "value": 138412032.0 + }, + { + "name": "smsp__sass_thread_inst_executed_op_control_pred_on.sum", + "type": "Double", + "unit": "", + "value": 33554432.0 + }, + { + "name": "smsp__sass_thread_inst_executed_op_memory_pred_on.sum", + "type": "Double", + "unit": "", + "value": 150994944.0 + }, + { + "name": "smsp__sass_thread_inst_executed_op_misc_pred_on.sum", + "type": "Double", + "unit": "", + "value": 283639808.0 + }, + { + "name": "smsp__sass_thread_inst_executed_op_conversion_pred_on.sum", + "type": "Double", + "unit": "", + "value": 1572864.0 + }, + { + "name": "smsp__inst_executed.sum", + "type": "Double", + "unit": "", + "value": 165953536.0 + }, + { + "name": "smsp__pipe_fma_cycles_active.avg.pct_of_peak_sustained_active", + "type": "Double", + "unit": "", + "value": 37.6418134163153 + }, + { + "name": "smsp__inst_executed_pipe_fp64.avg.pct_of_peak_sustained_active", + "type": "Double", + "unit": "", + "value": 0.0 + }, + { + "name": "smsp__inst_executed_pipe_xu.avg.pct_of_peak_sustained_active", + "type": "Double", + "unit": "", + "value": 94.90245952138653 + }, + { + "name": "smsp__inst_executed_pipe_lsu.avg.pct_of_peak_sustained_active", + "type": "Double", + "unit": "", + "value": 3.371168911221128 + }, + { + "name": "smsp__inst_executed_pipe_tex.avg.pct_of_peak_sustained_active", + "type": "Double", + "unit": "", + "value": 0.0 + }, + { + "name": "smsp__issue_active.avg.pct_of_peak_sustained_active", + "type": "Double", + "unit": "", + "value": 58.67238615804046 + }, + { + "name": "smsp__thread_inst_executed_per_inst_executed.ratio", + "type": "Double", + "unit": "", + "value": 32.0 + }, + { + "name": "smsp__thread_inst_executed_per_inst_executed.pct", + "type": "Double", + "unit": "", + "value": 100.0 + } + ], + "objectives": [ + "time" + ], + "times": { + "compilation_time": 15493.764, + "data": 57389.514, + "framework": 168132.216, + "kernel_overhead": 21212.9, + "profiling_overhead": 47844.708, + "profiling_runs": 41685.094, + "runtimes": [ + 1784.352 + ], + "search_algorithm": 24.405, + "validation": 15.25 + }, + "timestamp": "2026-03-13 09:40:46 UTC" + }, + { + "compilation_data": { + "constant_memory_size": 0, + "global_size": { + "x": 16, + "y": 64, + "z": 128 + }, + "local_memory_size": 0, + "local_size": { + "x": 16, + "y": 4, + "z": 1 + }, + "max_work_group_size": 1024, + "private_memory_size": 0, + "registers": 22 + }, + "configuration": { + "INNER_UNROLL_FACTOR": 1, + "USE_CONSTANT_MEMORY": 0, + "USE_SOA": 1, + "VECTOR_SIZE": 1, + "WORK_GROUP_SIZE_X": 16, + "WORK_GROUP_SIZE_Y": 4, + "WORK_GROUP_SIZE_Z": 1, + "Z_ITERATIONS": 2 + }, + "correctness": 1, + "invalidity": "correct", + "measurements": [ + { + "name": "time", + "unit": "", + "value": 6246.688 + }, + { + "name": "dram__throughput.avg.pct_of_peak_sustained_elapsed", + "type": "Double", + "unit": "", + "value": 7.836189739637224 + }, + { + "name": "dram__sectors_read.sum", + "type": "Double", + "unit": "", + "value": 22820.0 + }, + { + "name": "dram__sectors_write.sum", + "type": "Double", + "unit": "", + "value": 1877204.0 + }, + { + "name": "lts__t_sectors.avg.pct_of_peak_sustained_elapsed", + "type": "Double", + "unit": "", + "value": 1.0214725938776774 + }, + { + "name": "lts__t_sectors_op_read.sum", + "type": "Double", + "unit": "", + "value": 133052.0 + }, + { + "name": "lts__t_sectors_op_write.sum", + "type": "Double", + "unit": "", + "value": 2108568.0 + }, + { + "name": "l1tex__t_requests_pipe_lsu_mem_global_op_ld.avg.pct_of_peak_sustained_active", + "type": "Double", + "unit": "", + "value": 26.284808788742293 + }, + { + "name": "l1tex__t_requests_pipe_lsu_mem_global_op_ld.sum", + "type": "Double", + "unit": "", + "value": 67108864.0 + }, + { + "name": "l1tex__data_pipe_lsu_wavefronts_mem_shared.avg.pct_of_peak_sustained_elapsed", + "type": "Double", + "unit": "", + "value": 0.30795683278752845 + }, + { + "name": "l1tex__data_pipe_lsu_wavefronts_mem_shared_op_ld.sum", + "type": "Double", + "unit": "", + "value": 0.0 + }, + { + "name": "l1tex__data_pipe_lsu_wavefronts_mem_shared_op_st.sum", + "type": "Double", + "unit": "", + "value": 0.0 + }, + { + "name": "l1tex__t_sectors_pipe_lsu_mem_local_op_ld.sum", + "type": "Double", + "unit": "", + "value": 0.0 + }, + { + "name": "l1tex__t_sectors_pipe_lsu_mem_local_op_st.sum", + "type": "Double", + "unit": "", + "value": 0.0 + }, + { + "name": "sm__warps_active.avg.pct_of_peak_sustained_active", + "type": "Double", + "unit": "", + "value": 98.28837896116802 + }, + { + "name": "smsp__cycles_active.avg.pct_of_peak_sustained_elapsed", + "type": "Double", + "unit": "", + "value": 99.97143879253004 + }, + { + "name": "smsp__sass_thread_inst_executed_op_fp16_pred_on.sum", + "type": "Double", + "unit": "", + "value": 0.0 + }, + { + "name": "smsp__sass_thread_inst_executed_op_fp32_pred_on.sum", + "type": "Double", + "unit": "", + "value": 6979321856.0 + }, + { + "name": "smsp__sass_thread_inst_executed_op_fp64_pred_on.sum", + "type": "Double", + "unit": "", + "value": 0.0 + }, + { + "name": "smsp__sass_thread_inst_executed_op_integer_pred_on.sum", + "type": "Double", + "unit": "", + "value": 10880024576.0 + }, + { + "name": "smsp__sass_thread_inst_executed_op_control_pred_on.sum", + "type": "Double", + "unit": "", + "value": 1073741824.0 + }, + { + "name": "smsp__sass_thread_inst_executed_op_memory_pred_on.sum", + "type": "Double", + "unit": "", + "value": 2164260864.0 + }, + { + "name": "smsp__sass_thread_inst_executed_op_misc_pred_on.sum", + "type": "Double", + "unit": "", + "value": 2248146944.0 }, { "name": "smsp__sass_thread_inst_executed_op_conversion_pred_on.sum", @@ -25700,13 +27604,260 @@ "name": "smsp__inst_executed.sum", "type": "Double", "unit": "", - "value": 335020032.0 + "value": 816054272.0 + }, + { + "name": "smsp__pipe_fma_cycles_active.avg.pct_of_peak_sustained_active", + "type": "Double", + "unit": "", + "value": 43.02333002957223 + }, + { + "name": "smsp__inst_executed_pipe_fp64.avg.pct_of_peak_sustained_active", + "type": "Double", + "unit": "", + "value": 0.0 + }, + { + "name": "smsp__inst_executed_pipe_xu.avg.pct_of_peak_sustained_active", + "type": "Double", + "unit": "", + "value": 26.286490803710784 + }, + { + "name": "smsp__inst_executed_pipe_lsu.avg.pct_of_peak_sustained_active", + "type": "Double", + "unit": "", + "value": 13.39994941361038 + }, + { + "name": "smsp__inst_executed_pipe_tex.avg.pct_of_peak_sustained_active", + "type": "Double", + "unit": "", + "value": 0.0 + }, + { + "name": "smsp__issue_active.avg.pct_of_peak_sustained_active", + "type": "Double", + "unit": "", + "value": 79.91229963766371 + }, + { + "name": "smsp__thread_inst_executed_per_inst_executed.ratio", + "type": "Double", + "unit": "", + "value": 32.0 + }, + { + "name": "smsp__thread_inst_executed_per_inst_executed.pct", + "type": "Double", + "unit": "", + "value": 100.0 + } + ], + "objectives": [ + "time" + ], + "times": { + "compilation_time": 15765.185, + "data": 57693.839, + "framework": 2169476.482, + "kernel_overhead": 1009587.033, + "profiling_overhead": 47952.091, + "profiling_runs": 1054243.519, + "runtimes": [ + 6246.688 + ], + "search_algorithm": 24.394, + "validation": 15.474 + }, + "timestamp": "2026-03-13 09:40:47 UTC" + }, + { + "compilation_data": { + "constant_memory_size": 0, + "global_size": { + "x": 16, + "y": 64, + "z": 64 + }, + "local_memory_size": 0, + "local_size": { + "x": 16, + "y": 4, + "z": 1 + }, + "max_work_group_size": 1024, + "private_memory_size": 0, + "registers": 25 + }, + "configuration": { + "INNER_UNROLL_FACTOR": 1, + "USE_CONSTANT_MEMORY": 0, + "USE_SOA": 1, + "VECTOR_SIZE": 1, + "WORK_GROUP_SIZE_X": 16, + "WORK_GROUP_SIZE_Y": 4, + "WORK_GROUP_SIZE_Z": 1, + "Z_ITERATIONS": 4 + }, + "correctness": 1, + "invalidity": "correct", + "measurements": [ + { + "name": "time", + "unit": "", + "value": 6392.16 + }, + { + "name": "dram__throughput.avg.pct_of_peak_sustained_elapsed", + "type": "Double", + "unit": "", + "value": 7.8096528910735845 + }, + { + "name": "dram__sectors_read.sum", + "type": "Double", + "unit": "", + "value": 12836.0 + }, + { + "name": "dram__sectors_write.sum", + "type": "Double", + "unit": "", + "value": 1874356.0 + }, + { + "name": "lts__t_sectors.avg.pct_of_peak_sustained_elapsed", + "type": "Double", + "unit": "", + "value": 1.0190165212624498 + }, + { + "name": "lts__t_sectors_op_read.sum", + "type": "Double", + "unit": "", + "value": 118847.0 + }, + { + "name": "lts__t_sectors_op_write.sum", + "type": "Double", + "unit": "", + "value": 2109336.0 + }, + { + "name": "l1tex__t_requests_pipe_lsu_mem_global_op_ld.avg.pct_of_peak_sustained_active", + "type": "Double", + "unit": "", + "value": 13.383773483256151 + }, + { + "name": "l1tex__t_requests_pipe_lsu_mem_global_op_ld.sum", + "type": "Double", + "unit": "", + "value": 33554432.0 + }, + { + "name": "l1tex__data_pipe_lsu_wavefronts_mem_shared.avg.pct_of_peak_sustained_elapsed", + "type": "Double", + "unit": "", + "value": 0.15681206483032908 + }, + { + "name": "l1tex__data_pipe_lsu_wavefronts_mem_shared_op_ld.sum", + "type": "Double", + "unit": "", + "value": 0.0 + }, + { + "name": "l1tex__data_pipe_lsu_wavefronts_mem_shared_op_st.sum", + "type": "Double", + "unit": "", + "value": 0.0 + }, + { + "name": "l1tex__t_sectors_pipe_lsu_mem_local_op_ld.sum", + "type": "Double", + "unit": "", + "value": 0.0 + }, + { + "name": "l1tex__t_sectors_pipe_lsu_mem_local_op_st.sum", + "type": "Double", + "unit": "", + "value": 0.0 + }, + { + "name": "sm__warps_active.avg.pct_of_peak_sustained_active", + "type": "Double", + "unit": "", + "value": 98.79373301786582 + }, + { + "name": "smsp__cycles_active.avg.pct_of_peak_sustained_elapsed", + "type": "Double", + "unit": "", + "value": 99.97551994882552 + }, + { + "name": "smsp__sass_thread_inst_executed_op_fp16_pred_on.sum", + "type": "Double", + "unit": "", + "value": 0.0 + }, + { + "name": "smsp__sass_thread_inst_executed_op_fp32_pred_on.sum", + "type": "Double", + "unit": "", + "value": 5637144576.0 + }, + { + "name": "smsp__sass_thread_inst_executed_op_fp64_pred_on.sum", + "type": "Double", + "unit": "", + "value": 0.0 + }, + { + "name": "smsp__sass_thread_inst_executed_op_integer_pred_on.sum", + "type": "Double", + "unit": "", + "value": 10854858752.0 + }, + { + "name": "smsp__sass_thread_inst_executed_op_control_pred_on.sum", + "type": "Double", + "unit": "", + "value": 1073741824.0 + }, + { + "name": "smsp__sass_thread_inst_executed_op_memory_pred_on.sum", + "type": "Double", + "unit": "", + "value": 1090519040.0 + }, + { + "name": "smsp__sass_thread_inst_executed_op_misc_pred_on.sum", + "type": "Double", + "unit": "", + "value": 1392508928.0 + }, + { + "name": "smsp__sass_thread_inst_executed_op_conversion_pred_on.sum", + "type": "Double", + "unit": "", + "value": 12582912.0 + }, + { + "name": "smsp__inst_executed.sum", + "type": "Double", + "unit": "", + "value": 837287936.0 }, { "name": "smsp__pipe_fma_cycles_active.avg.pct_of_peak_sustained_active", "type": "Double", "unit": "", - "value": 28.861640347759447 + "value": 36.232821485889374 }, { "name": "smsp__inst_executed_pipe_fp64.avg.pct_of_peak_sustained_active", @@ -25718,13 +27869,13 @@ "name": "smsp__inst_executed_pipe_xu.avg.pct_of_peak_sustained_active", "type": "Double", "unit": "", - "value": 45.39780747404921 + "value": 26.769145498226433 }, { "name": "smsp__inst_executed_pipe_lsu.avg.pct_of_peak_sustained_active", "type": "Double", "unit": "", - "value": 23.319576886083873 + "value": 6.87527858011089 }, { "name": "smsp__inst_executed_pipe_tex.avg.pct_of_peak_sustained_active", @@ -25736,7 +27887,7 @@ "name": "smsp__issue_active.avg.pct_of_peak_sustained_active", "type": "Double", "unit": "", - "value": 56.65918874348026 + "value": 83.4971041779249 }, { "name": "smsp__thread_inst_executed_per_inst_executed.ratio", @@ -25755,30 +27906,47 @@ "time" ], "times": { - "compilation": 22256.442, - "data": 60527.034, - "framework": 329791.705, - "kernel_overhead": 94072.858, - "profiling_overhead": 50387.787, - "profiling_runs": 124804.026, + "compilation_time": 14440.326, + "data": 58278.51, + "framework": 2754750.534, + "kernel_overhead": 1302327.149, + "profiling_overhead": 48199.578, + "profiling_runs": 1345945.297, "runtimes": [ - 3689.728 + 6392.16 ], - "search_algorithm": 30.34, - "validation": 16.898 + "search_algorithm": 30.127, + "validation": 15.414 }, - "timestamp": "2026-01-27 09:26:14 UTC" + "timestamp": "2026-03-13 09:40:49 UTC" }, { + "compilation_data": { + "constant_memory_size": 0, + "global_size": { + "x": 16, + "y": 64, + "z": 32 + }, + "local_memory_size": 0, + "local_size": { + "x": 16, + "y": 4, + "z": 1 + }, + "max_work_group_size": 1024, + "private_memory_size": 32, + "registers": 27 + }, "configuration": { - "INNER_UNROLL_FACTOR": "0", - "USE_CONSTANT_MEMORY": "0", - "USE_SOA": "1", - "VECTOR_SIZE": "1", - "WORK_GROUP_SIZE_X": "32", - "WORK_GROUP_SIZE_Y": "2", - "WORK_GROUP_SIZE_Z": "1", - "Z_ITERATIONS": "2" + "INNER_UNROLL_FACTOR": 1, + "USE_CONSTANT_MEMORY": 0, + "USE_SOA": 1, + "VECTOR_SIZE": 1, + "WORK_GROUP_SIZE_X": 16, + "WORK_GROUP_SIZE_Y": 4, + "WORK_GROUP_SIZE_Z": 1, + "Z_ITERATIONS": 8 }, "correctness": 1, "invalidity": "correct", @@ -25786,61 +27954,61 @@ { "name": "time", "unit": "", - "value": 3647.744 + "value": 8159.872 }, { "name": "dram__throughput.avg.pct_of_peak_sustained_elapsed", "type": "Double", "unit": "", - "value": 28.12676463623447 + "value": 6.19687734698657 }, { "name": "dram__sectors_read.sum", "type": "Double", "unit": "", - "value": 2109592.0 + "value": 27536.0 }, { "name": "dram__sectors_write.sum", "type": "Double", "unit": "", - "value": 1843312.0 + "value": 1936888.0 }, { "name": "lts__t_sectors.avg.pct_of_peak_sustained_elapsed", "type": "Double", "unit": "", - "value": 3.339423251173719 + "value": 48.497669266090185 }, { "name": "lts__t_sectors_op_read.sum", "type": "Double", "unit": "", - "value": 2174930.0 + "value": 763505.0 }, { "name": "lts__t_sectors_op_write.sum", "type": "Double", "unit": "", - "value": 2102892.0 + "value": 138421065.0 }, { "name": "l1tex__t_requests_pipe_lsu_mem_global_op_ld.avg.pct_of_peak_sustained_active", "type": "Double", "unit": "", - "value": 45.7150028910367 + "value": 5.128552136146367 }, { "name": "l1tex__t_requests_pipe_lsu_mem_global_op_ld.sum", "type": "Double", "unit": "", - "value": 67633152.0 + "value": 16777216.0 }, { "name": "l1tex__data_pipe_lsu_wavefronts_mem_shared.avg.pct_of_peak_sustained_elapsed", "type": "Double", "unit": "", - "value": 0.5313036163117085 + "value": 0.05993807764419942 }, { "name": "l1tex__data_pipe_lsu_wavefronts_mem_shared_op_ld.sum", @@ -25858,25 +28026,25 @@ "name": "l1tex__t_sectors_pipe_lsu_mem_local_op_ld.sum", "type": "Double", "unit": "", - "value": 0.0 + "value": 136314880.0 }, { "name": "l1tex__t_sectors_pipe_lsu_mem_local_op_st.sum", "type": "Double", "unit": "", - "value": 0.0 + "value": 136314880.0 }, { "name": "sm__warps_active.avg.pct_of_peak_sustained_active", "type": "Double", "unit": "", - "value": 78.39865865637181 + "value": 93.20929482105498 }, { "name": "smsp__cycles_active.avg.pct_of_peak_sustained_elapsed", "type": "Double", "unit": "", - "value": 99.8961127169075 + "value": 99.30702062354234 }, { "name": "smsp__sass_thread_inst_executed_op_fp16_pred_on.sum", @@ -25888,7 +28056,7 @@ "name": "smsp__sass_thread_inst_executed_op_fp32_pred_on.sum", "type": "Double", "unit": "", - "value": 6459228160.0 + "value": 4966055936.0 }, { "name": "smsp__sass_thread_inst_executed_op_fp64_pred_on.sum", @@ -25900,43 +28068,43 @@ "name": "smsp__sass_thread_inst_executed_op_integer_pred_on.sum", "type": "Double", "unit": "", - "value": 1509949440.0 + "value": 4250927104.0 }, { "name": "smsp__sass_thread_inst_executed_op_control_pred_on.sum", "type": "Double", "unit": "", - "value": 142606336.0 + "value": 1073741824.0 }, { "name": "smsp__sass_thread_inst_executed_op_memory_pred_on.sum", "type": "Double", "unit": "", - "value": 2181038080.0 + "value": 2722103296.0 }, { "name": "smsp__sass_thread_inst_executed_op_misc_pred_on.sum", "type": "Double", "unit": "", - "value": 184549376.0 + "value": 289406976.0 }, { "name": "smsp__sass_thread_inst_executed_op_conversion_pred_on.sum", "type": "Double", "unit": "", - "value": 25165824.0 + "value": 6291456.0 }, { "name": "smsp__inst_executed.sum", "type": "Double", "unit": "", - "value": 335020032.0 + "value": 420413440.0 }, { "name": "smsp__pipe_fma_cycles_active.avg.pct_of_peak_sustained_active", "type": "Double", "unit": "", - "value": 28.85161600364488 + "value": 12.38187895391247 }, { "name": "smsp__inst_executed_pipe_fp64.avg.pct_of_peak_sustained_active", @@ -25948,13 +28116,13 @@ "name": "smsp__inst_executed_pipe_xu.avg.pct_of_peak_sustained_active", "type": "Double", "unit": "", - "value": 45.38505789551006 + "value": 20.60162887520626 }, { "name": "smsp__inst_executed_pipe_lsu.avg.pct_of_peak_sustained_active", "type": "Double", "unit": "", - "value": 23.31302778617021 + "value": 13.087265218087571 }, { "name": "smsp__inst_executed_pipe_tex.avg.pct_of_peak_sustained_active", @@ -25966,7 +28134,7 @@ "name": "smsp__issue_active.avg.pct_of_peak_sustained_active", "type": "Double", "unit": "", - "value": 56.643248438128815 + "value": 32.26583497650618 }, { "name": "smsp__thread_inst_executed_per_inst_executed.ratio", @@ -25985,30 +28153,47 @@ "time" ], "times": { - "compilation": 24698.122, - "data": 62071.893, - "framework": 333937.608, - "kernel_overhead": 95095.053, - "profiling_overhead": 51044.098, - "profiling_runs": 125726.564, + "compilation_time": 14000.099, + "data": 58775.641, + "framework": 928984.1769999999, + "kernel_overhead": 383484.432, + "profiling_overhead": 48639.899, + "profiling_runs": 438084.205, "runtimes": [ - 3647.744 + 8159.872 ], - "search_algorithm": 33.455, - "validation": 21.326 + "search_algorithm": 25.951, + "validation": 14.318 }, - "timestamp": "2026-01-27 09:26:14 UTC" + "timestamp": "2026-03-13 09:40:49 UTC" }, { + "compilation_data": { + "constant_memory_size": 0, + "global_size": { + "x": 16, + "y": 64, + "z": 16 + }, + "local_memory_size": 0, + "local_size": { + "x": 16, + "y": 4, + "z": 1 + }, + "max_work_group_size": 1024, + "private_memory_size": 64, + "registers": 34 + }, "configuration": { - "INNER_UNROLL_FACTOR": "0", - "USE_CONSTANT_MEMORY": "0", - "USE_SOA": "1", - "VECTOR_SIZE": "1", - "WORK_GROUP_SIZE_X": "32", - "WORK_GROUP_SIZE_Y": "4", - "WORK_GROUP_SIZE_Z": "1", - "Z_ITERATIONS": "2" + "INNER_UNROLL_FACTOR": 1, + "USE_CONSTANT_MEMORY": 0, + "USE_SOA": 1, + "VECTOR_SIZE": 1, + "WORK_GROUP_SIZE_X": 16, + "WORK_GROUP_SIZE_Y": 4, + "WORK_GROUP_SIZE_Z": 1, + "Z_ITERATIONS": 16 }, "correctness": 1, "invalidity": "correct", @@ -26016,61 +28201,61 @@ { "name": "time", "unit": "", - "value": 3874.4 + "value": 8326.815 }, { "name": "dram__throughput.avg.pct_of_peak_sustained_elapsed", "type": "Double", "unit": "", - "value": 27.439386015666727 + "value": 6.1705179750351045 }, { "name": "dram__sectors_read.sum", "type": "Double", "unit": "", - "value": 2105984.0 + "value": 6740.0 }, { "name": "dram__sectors_write.sum", "type": "Double", "unit": "", - "value": 1839668.0 + "value": 2029448.0 }, { "name": "lts__t_sectors.avg.pct_of_peak_sustained_elapsed", "type": "Double", "unit": "", - "value": 3.3493587572195014 + "value": 54.24772351483889 }, { "name": "lts__t_sectors_op_read.sum", "type": "Double", "unit": "", - "value": 2169775.0 + "value": 17211968.0 }, { "name": "lts__t_sectors_op_write.sum", "type": "Double", "unit": "", - "value": 2101461.0 + "value": 138425089.0 }, { "name": "l1tex__t_requests_pipe_lsu_mem_global_op_ld.avg.pct_of_peak_sustained_active", "type": "Double", "unit": "", - "value": 45.744860923097875 + "value": 2.576918862712292 }, { "name": "l1tex__t_requests_pipe_lsu_mem_global_op_ld.sum", "type": "Double", "unit": "", - "value": 67633152.0 + "value": 8388608.0 }, { "name": "l1tex__data_pipe_lsu_wavefronts_mem_shared.avg.pct_of_peak_sustained_elapsed", "type": "Double", "unit": "", - "value": 0.531707833944502 + "value": 0.02992657613909857 }, { "name": "l1tex__data_pipe_lsu_wavefronts_mem_shared_op_ld.sum", @@ -26088,25 +28273,25 @@ "name": "l1tex__t_sectors_pipe_lsu_mem_local_op_ld.sum", "type": "Double", "unit": "", - "value": 0.0 + "value": 136314880.0 }, { "name": "l1tex__t_sectors_pipe_lsu_mem_local_op_st.sum", "type": "Double", "unit": "", - "value": 0.0 + "value": 136314880.0 }, { "name": "sm__warps_active.avg.pct_of_peak_sustained_active", "type": "Double", "unit": "", - "value": 93.77501175740768 + "value": 88.78623432607272 }, { "name": "smsp__cycles_active.avg.pct_of_peak_sustained_elapsed", "type": "Double", "unit": "", - "value": 99.9468947646434 + "value": 99.07521113847291 }, { "name": "smsp__sass_thread_inst_executed_op_fp16_pred_on.sum", @@ -26118,7 +28303,7 @@ "name": "smsp__sass_thread_inst_executed_op_fp32_pred_on.sum", "type": "Double", "unit": "", - "value": 6459228160.0 + "value": 4630511616.0 }, { "name": "smsp__sass_thread_inst_executed_op_fp64_pred_on.sum", @@ -26130,43 +28315,43 @@ "name": "smsp__sass_thread_inst_executed_op_integer_pred_on.sum", "type": "Double", "unit": "", - "value": 1509949440.0 + "value": 3769630720.0 }, { "name": "smsp__sass_thread_inst_executed_op_control_pred_on.sum", "type": "Double", "unit": "", - "value": 142606336.0 + "value": 1073741824.0 }, { "name": "smsp__sass_thread_inst_executed_op_memory_pred_on.sum", "type": "Double", "unit": "", - "value": 2181038080.0 + "value": 2453667840.0 }, { "name": "smsp__sass_thread_inst_executed_op_misc_pred_on.sum", "type": "Double", "unit": "", - "value": 184549376.0 + "value": 144703488.0 }, { "name": "smsp__sass_thread_inst_executed_op_conversion_pred_on.sum", "type": "Double", "unit": "", - "value": 25165824.0 + "value": 3145728.0 }, { "name": "smsp__inst_executed.sum", "type": "Double", "unit": "", - "value": 335020032.0 + "value": 379617280.0 }, { "name": "smsp__pipe_fma_cycles_active.avg.pct_of_peak_sustained_active", "type": "Double", "unit": "", - "value": 28.86008237705141 + "value": 10.124178555010122 }, { "name": "smsp__inst_executed_pipe_fp64.avg.pct_of_peak_sustained_active", @@ -26178,13 +28363,13 @@ "name": "smsp__inst_executed_pipe_xu.avg.pct_of_peak_sustained_active", "type": "Double", "unit": "", - "value": 45.396509753279005 + "value": 20.62057273748397 }, { "name": "smsp__inst_executed_pipe_lsu.avg.pct_of_peak_sustained_active", "type": "Double", "unit": "", - "value": 23.31891028342261 + "value": 11.795410625958237 }, { "name": "smsp__inst_executed_pipe_tex.avg.pct_of_peak_sustained_active", @@ -26196,7 +28381,7 @@ "name": "smsp__issue_active.avg.pct_of_peak_sustained_active", "type": "Double", "unit": "", - "value": 56.65756911006299 + "value": 29.16164362653782 }, { "name": "smsp__thread_inst_executed_per_inst_executed.ratio", @@ -26215,30 +28400,47 @@ "time" ], "times": { - "compilation": 24153.813, - "data": 61606.659, - "framework": 331461.613, - "kernel_overhead": 93772.547, - "profiling_overhead": 50944.498, - "profiling_runs": 125137.909, + "compilation_time": 14715.837, + "data": 59351.273, + "framework": 895376.1560000001, + "kernel_overhead": 365622.03, + "profiling_overhead": 49096.226, + "profiling_runs": 421306.627, "runtimes": [ - 3874.4 + 8326.815 ], - "search_algorithm": 33.304, - "validation": 19.688 + "search_algorithm": 26.506, + "validation": 16.315 }, - "timestamp": "2026-01-27 09:26:14 UTC" + "timestamp": "2026-03-13 09:40:50 UTC" }, { + "compilation_data": { + "constant_memory_size": 0, + "global_size": { + "x": 16, + "y": 64, + "z": 8 + }, + "local_memory_size": 0, + "local_size": { + "x": 16, + "y": 4, + "z": 1 + }, + "max_work_group_size": 1024, + "private_memory_size": 128, + "registers": 32 + }, "configuration": { - "INNER_UNROLL_FACTOR": "0", - "USE_CONSTANT_MEMORY": "0", - "USE_SOA": "1", - "VECTOR_SIZE": "1", - "WORK_GROUP_SIZE_X": "32", - "WORK_GROUP_SIZE_Y": "8", - "WORK_GROUP_SIZE_Z": "1", - "Z_ITERATIONS": "2" + "INNER_UNROLL_FACTOR": 1, + "USE_CONSTANT_MEMORY": 0, + "USE_SOA": 1, + "VECTOR_SIZE": 1, + "WORK_GROUP_SIZE_X": 16, + "WORK_GROUP_SIZE_Y": 4, + "WORK_GROUP_SIZE_Z": 1, + "Z_ITERATIONS": 32 }, "correctness": 1, "invalidity": "correct", @@ -26246,61 +28448,61 @@ { "name": "time", "unit": "", - "value": 3669.792 + "value": 9648.192 }, { "name": "dram__throughput.avg.pct_of_peak_sustained_elapsed", "type": "Double", "unit": "", - "value": 28.18861177333578 + "value": 5.675440261778125 }, { "name": "dram__sectors_read.sum", "type": "Double", "unit": "", - "value": 2097848.0 + "value": 6100.0 }, { "name": "dram__sectors_write.sum", "type": "Double", "unit": "", - "value": 1837192.0 + "value": 2299440.0 }, { "name": "lts__t_sectors.avg.pct_of_peak_sustained_elapsed", "type": "Double", "unit": "", - "value": 3.3446875923765362 + "value": 74.71883278111405 }, { "name": "lts__t_sectors_op_read.sum", "type": "Double", "unit": "", - "value": 2156893.0 + "value": 108341756.0 }, { "name": "lts__t_sectors_op_write.sum", "type": "Double", "unit": "", - "value": 2098973.0 + "value": 138417100.0 }, { "name": "l1tex__t_requests_pipe_lsu_mem_global_op_ld.avg.pct_of_peak_sustained_active", "type": "Double", "unit": "", - "value": 45.74793350461189 + "value": 1.128991814877712 }, { "name": "l1tex__t_requests_pipe_lsu_mem_global_op_ld.sum", "type": "Double", "unit": "", - "value": 67633152.0 + "value": 4194304.0 }, { "name": "l1tex__data_pipe_lsu_wavefronts_mem_shared.avg.pct_of_peak_sustained_elapsed", "type": "Double", "unit": "", - "value": 0.5317793961557972 + "value": 0.013096344632014244 }, { "name": "l1tex__data_pipe_lsu_wavefronts_mem_shared_op_ld.sum", @@ -26318,25 +28520,25 @@ "name": "l1tex__t_sectors_pipe_lsu_mem_local_op_ld.sum", "type": "Double", "unit": "", - "value": 0.0 + "value": 136314880.0 }, { "name": "l1tex__t_sectors_pipe_lsu_mem_local_op_st.sum", "type": "Double", "unit": "", - "value": 0.0 + "value": 136314880.0 }, { "name": "sm__warps_active.avg.pct_of_peak_sustained_active", "type": "Double", "unit": "", - "value": 94.6212967145825 + "value": 92.09760452940903 }, { "name": "smsp__cycles_active.avg.pct_of_peak_sustained_elapsed", "type": "Double", "unit": "", - "value": 99.94871621072991 + "value": 99.53536486258409 }, { "name": "smsp__sass_thread_inst_executed_op_fp16_pred_on.sum", @@ -26348,7 +28550,7 @@ "name": "smsp__sass_thread_inst_executed_op_fp32_pred_on.sum", "type": "Double", "unit": "", - "value": 6459228160.0 + "value": 4462739456.0 }, { "name": "smsp__sass_thread_inst_executed_op_fp64_pred_on.sum", @@ -26360,43 +28562,43 @@ "name": "smsp__sass_thread_inst_executed_op_integer_pred_on.sum", "type": "Double", "unit": "", - "value": 1509949440.0 + "value": 3528982528.0 }, { "name": "smsp__sass_thread_inst_executed_op_control_pred_on.sum", "type": "Double", "unit": "", - "value": 142606336.0 + "value": 1073741824.0 }, { "name": "smsp__sass_thread_inst_executed_op_memory_pred_on.sum", "type": "Double", "unit": "", - "value": 2181038080.0 + "value": 2319450112.0 }, { "name": "smsp__sass_thread_inst_executed_op_misc_pred_on.sum", "type": "Double", "unit": "", - "value": 184549376.0 + "value": 72351744.0 }, { "name": "smsp__sass_thread_inst_executed_op_conversion_pred_on.sum", "type": "Double", "unit": "", - "value": 25165824.0 + "value": 1572864.0 }, { "name": "smsp__inst_executed.sum", "type": "Double", "unit": "", - "value": 335020032.0 + "value": 359219200.0 }, { "name": "smsp__pipe_fma_cycles_active.avg.pct_of_peak_sustained_active", "type": "Double", "unit": "", - "value": 28.864332642636192 + "value": 7.8307165863572 }, { "name": "smsp__inst_executed_pipe_fp64.avg.pct_of_peak_sustained_active", @@ -26408,13 +28610,13 @@ "name": "smsp__inst_executed_pipe_xu.avg.pct_of_peak_sustained_active", "type": "Double", "unit": "", - "value": 45.40179223141415 + "value": 17.96434452779269 }, { "name": "smsp__inst_executed_pipe_lsu.avg.pct_of_peak_sustained_active", "type": "Double", "unit": "", - "value": 23.32162374387094 + "value": 9.708026516667267 }, { "name": "smsp__inst_executed_pipe_tex.avg.pct_of_peak_sustained_active", @@ -26426,7 +28628,7 @@ "name": "smsp__issue_active.avg.pct_of_peak_sustained_active", "type": "Double", "unit": "", - "value": 56.66419950779592 + "value": 24.04011343549971 }, { "name": "smsp__thread_inst_executed_per_inst_executed.ratio", @@ -26445,30 +28647,47 @@ "time" ], "times": { - "compilation": 25614.132, - "data": 60135.216, - "framework": 329853.397, - "kernel_overhead": 94765.643, - "profiling_overhead": 50065.833, - "profiling_runs": 124886.705, + "compilation_time": 14392.754, + "data": 58429.02, + "framework": 877806.395, + "kernel_overhead": 353870.142, + "profiling_overhead": 48339.623, + "profiling_runs": 417167.61, "runtimes": [ - 3669.792 + 9648.192 ], - "search_algorithm": 36.866, - "validation": 21.569 + "search_algorithm": 27.976, + "validation": 15.225 }, - "timestamp": "2026-01-27 09:26:14 UTC" + "timestamp": "2026-03-13 09:40:50 UTC" }, { + "compilation_data": { + "constant_memory_size": 0, + "global_size": { + "x": 16, + "y": 64, + "z": 64 + }, + "local_memory_size": 0, + "local_size": { + "x": 16, + "y": 4, + "z": 1 + }, + "max_work_group_size": 1024, + "private_memory_size": 0, + "registers": 26 + }, "configuration": { - "INNER_UNROLL_FACTOR": "0", - "USE_CONSTANT_MEMORY": "0", - "USE_SOA": "1", - "VECTOR_SIZE": "1", - "WORK_GROUP_SIZE_X": "16", - "WORK_GROUP_SIZE_Y": "4", - "WORK_GROUP_SIZE_Z": "1", - "Z_ITERATIONS": "4" + "INNER_UNROLL_FACTOR": 2, + "USE_CONSTANT_MEMORY": 0, + "USE_SOA": 1, + "VECTOR_SIZE": 1, + "WORK_GROUP_SIZE_X": 16, + "WORK_GROUP_SIZE_Y": 4, + "WORK_GROUP_SIZE_Z": 1, + "Z_ITERATIONS": 4 }, "correctness": 1, "invalidity": "correct", @@ -26476,61 +28695,61 @@ { "name": "time", "unit": "", - "value": 2227.104 + "value": 4118.144 }, { "name": "dram__throughput.avg.pct_of_peak_sustained_elapsed", "type": "Double", "unit": "", - "value": 47.40238129748863 + "value": 12.185359198500507 }, { "name": "dram__sectors_read.sum", "type": "Double", "unit": "", - "value": 2103052.0 + "value": 8892.0 }, { "name": "dram__sectors_write.sum", "type": "Double", "unit": "", - "value": 1873712.0 + "value": 1871732.0 }, { "name": "lts__t_sectors.avg.pct_of_peak_sustained_elapsed", "type": "Double", "unit": "", - "value": 5.527479340741403 + "value": 1.55819996509876 }, { "name": "lts__t_sectors_op_read.sum", "type": "Double", "unit": "", - "value": 2127325.0 + "value": 76219.0 }, { "name": "lts__t_sectors_op_write.sum", "type": "Double", "unit": "", - "value": 2107031.0 + "value": 2100796.0 }, { "name": "l1tex__t_requests_pipe_lsu_mem_global_op_ld.avg.pct_of_peak_sustained_active", "type": "Double", "unit": "", - "value": 38.377380783446675 + "value": 20.894930416119603 }, { "name": "l1tex__t_requests_pipe_lsu_mem_global_op_ld.sum", "type": "Double", "unit": "", - "value": 34078720.0 + "value": 33554432.0 }, { "name": "l1tex__data_pipe_lsu_wavefronts_mem_shared.avg.pct_of_peak_sustained_elapsed", "type": "Double", "unit": "", - "value": 0.442455252175729 + "value": 0.24476922313857963 }, { "name": "l1tex__data_pipe_lsu_wavefronts_mem_shared_op_ld.sum", @@ -26560,13 +28779,13 @@ "name": "sm__warps_active.avg.pct_of_peak_sustained_active", "type": "Double", "unit": "", - "value": 96.59439773776062 + "value": 98.6122816945135 }, { "name": "smsp__cycles_active.avg.pct_of_peak_sustained_elapsed", "type": "Double", "unit": "", - "value": 99.90735344885012 + "value": 99.95412506755157 }, { "name": "smsp__sass_thread_inst_executed_op_fp16_pred_on.sum", @@ -26578,7 +28797,7 @@ "name": "smsp__sass_thread_inst_executed_op_fp32_pred_on.sum", "type": "Double", "unit": "", - "value": 5385486336.0 + "value": 5637144576.0 }, { "name": "smsp__sass_thread_inst_executed_op_fp64_pred_on.sum", @@ -26590,25 +28809,25 @@ "name": "smsp__sass_thread_inst_executed_op_integer_pred_on.sum", "type": "Double", "unit": "", - "value": 591396864.0 + "value": 5221908480.0 }, { "name": "smsp__sass_thread_inst_executed_op_control_pred_on.sum", "type": "Double", "unit": "", - "value": 71303168.0 + "value": 536870912.0 }, { "name": "smsp__sass_thread_inst_executed_op_memory_pred_on.sum", "type": "Double", "unit": "", - "value": 1107296256.0 + "value": 1090519040.0 }, { "name": "smsp__sass_thread_inst_executed_op_misc_pred_on.sum", "type": "Double", "unit": "", - "value": 146800640.0 + "value": 2193620992.0 }, { "name": "smsp__sass_thread_inst_executed_op_conversion_pred_on.sum", @@ -26620,13 +28839,13 @@ "name": "smsp__inst_executed.sum", "type": "Double", "unit": "", - "value": 229376000.0 + "value": 535298048.0 }, { "name": "smsp__pipe_fma_cycles_active.avg.pct_of_peak_sustained_active", "type": "Double", "unit": "", - "value": 43.216044764767446 + "value": 39.60904172433298 }, { "name": "smsp__inst_executed_pipe_fp64.avg.pct_of_peak_sustained_active", @@ -26638,13 +28857,13 @@ "name": "smsp__inst_executed_pipe_xu.avg.pct_of_peak_sustained_active", "type": "Double", "unit": "", - "value": 75.58238751329891 + "value": 41.7931199812104 }, { "name": "smsp__inst_executed_pipe_lsu.avg.pct_of_peak_sustained_active", "type": "Double", "unit": "", - "value": 19.70751705669024 + "value": 10.733975151424156 }, { "name": "smsp__inst_executed_pipe_tex.avg.pct_of_peak_sustained_active", @@ -26656,7 +28875,7 @@ "name": "smsp__issue_active.avg.pct_of_peak_sustained_active", "type": "Double", "unit": "", - "value": 64.58558229163997 + "value": 83.34202787349986 }, { "name": "smsp__thread_inst_executed_per_inst_executed.ratio", @@ -26675,30 +28894,47 @@ "time" ], "times": { - "compilation": 24745.366, - "data": 67331.57, - "framework": 220604.3, - "kernel_overhead": 36462.417, - "profiling_overhead": 57099.593, - "profiling_runs": 59710.72, + "compilation_time": 14649.823, + "data": 57822.417, + "framework": 1383010.094, + "kernel_overhead": 623065.003, + "profiling_overhead": 47501.27, + "profiling_runs": 654621.404, "runtimes": [ - 2227.104 + 4118.144 ], - "search_algorithm": 35.002, - "validation": 21.751 + "search_algorithm": 25.482, + "validation": 17.22 }, - "timestamp": "2026-01-27 09:26:15 UTC" + "timestamp": "2026-03-13 09:40:51 UTC" }, { + "compilation_data": { + "constant_memory_size": 0, + "global_size": { + "x": 16, + "y": 64, + "z": 32 + }, + "local_memory_size": 0, + "local_size": { + "x": 16, + "y": 4, + "z": 1 + }, + "max_work_group_size": 1024, + "private_memory_size": 0, + "registers": 30 + }, "configuration": { - "INNER_UNROLL_FACTOR": "0", - "USE_CONSTANT_MEMORY": "0", - "USE_SOA": "1", - "VECTOR_SIZE": "1", - "WORK_GROUP_SIZE_X": "16", - "WORK_GROUP_SIZE_Y": "8", - "WORK_GROUP_SIZE_Z": "1", - "Z_ITERATIONS": "4" + "INNER_UNROLL_FACTOR": 2, + "USE_CONSTANT_MEMORY": 0, + "USE_SOA": 1, + "VECTOR_SIZE": 1, + "WORK_GROUP_SIZE_X": 16, + "WORK_GROUP_SIZE_Y": 4, + "WORK_GROUP_SIZE_Z": 1, + "Z_ITERATIONS": 8 }, "correctness": 1, "invalidity": "correct", @@ -26706,61 +28942,61 @@ { "name": "time", "unit": "", - "value": 2243.2 + "value": 4286.4 }, { "name": "dram__throughput.avg.pct_of_peak_sustained_elapsed", "type": "Double", "unit": "", - "value": 46.99424404490973 + "value": 11.526096300970108 }, { "name": "dram__sectors_read.sum", "type": "Double", "unit": "", - "value": 2097380.0 + "value": 476.0 }, { "name": "dram__sectors_write.sum", "type": "Double", "unit": "", - "value": 1867280.0 + "value": 1870108.0 }, { "name": "lts__t_sectors.avg.pct_of_peak_sustained_elapsed", "type": "Double", "unit": "", - "value": 5.485041632510392 + "value": 1.4779955857288525 }, { "name": "lts__t_sectors_op_read.sum", "type": "Double", "unit": "", - "value": 2118814.0 + "value": 72303.0 }, { "name": "lts__t_sectors_op_write.sum", "type": "Double", "unit": "", - "value": 2098982.0 + "value": 2099100.0 }, { "name": "l1tex__t_requests_pipe_lsu_mem_global_op_ld.avg.pct_of_peak_sustained_active", "type": "Double", "unit": "", - "value": 38.37824776154798 + "value": 9.81808717905584 }, { "name": "l1tex__t_requests_pipe_lsu_mem_global_op_ld.sum", "type": "Double", "unit": "", - "value": 34078720.0 + "value": 16777216.0 }, { "name": "l1tex__data_pipe_lsu_wavefronts_mem_shared.avg.pct_of_peak_sustained_elapsed", "type": "Double", "unit": "", - "value": 0.44253870930711525 + "value": 0.11502286837716381 }, { "name": "l1tex__data_pipe_lsu_wavefronts_mem_shared_op_ld.sum", @@ -26790,13 +29026,13 @@ "name": "sm__warps_active.avg.pct_of_peak_sustained_active", "type": "Double", "unit": "", - "value": 95.11724651586354 + "value": 98.92833054832015 }, { "name": "smsp__cycles_active.avg.pct_of_peak_sustained_elapsed", "type": "Double", "unit": "", - "value": 99.9125433573501 + "value": 99.96532619930389 }, { "name": "smsp__sass_thread_inst_executed_op_fp16_pred_on.sum", @@ -26808,7 +29044,7 @@ "name": "smsp__sass_thread_inst_executed_op_fp32_pred_on.sum", "type": "Double", "unit": "", - "value": 5385486336.0 + "value": 4966055936.0 }, { "name": "smsp__sass_thread_inst_executed_op_fp64_pred_on.sum", @@ -26820,43 +29056,43 @@ "name": "smsp__sass_thread_inst_executed_op_integer_pred_on.sum", "type": "Double", "unit": "", - "value": 591396864.0 + "value": 5863636992.0 }, { "name": "smsp__sass_thread_inst_executed_op_control_pred_on.sum", "type": "Double", "unit": "", - "value": 71303168.0 + "value": 536870912.0 }, { "name": "smsp__sass_thread_inst_executed_op_memory_pred_on.sum", "type": "Double", "unit": "", - "value": 1107296256.0 + "value": 553648128.0 }, { "name": "smsp__sass_thread_inst_executed_op_misc_pred_on.sum", "type": "Double", "unit": "", - "value": 146800640.0 + "value": 1373634560.0 }, { "name": "smsp__sass_thread_inst_executed_op_conversion_pred_on.sum", "type": "Double", "unit": "", - "value": 12582912.0 + "value": 6291456.0 }, { "name": "smsp__inst_executed.sum", "type": "Double", "unit": "", - "value": 229376000.0 + "value": 621477888.0 }, { "name": "smsp__pipe_fma_cycles_active.avg.pct_of_peak_sustained_active", "type": "Double", "unit": "", - "value": 43.22219377794553 + "value": 43.263528391220646 }, { "name": "smsp__inst_executed_pipe_fp64.avg.pct_of_peak_sustained_active", @@ -26868,13 +29104,13 @@ "name": "smsp__inst_executed_pipe_xu.avg.pct_of_peak_sustained_active", "type": "Double", "unit": "", - "value": 75.59271724100114 + "value": 39.27475712374755 }, { "name": "smsp__inst_executed_pipe_lsu.avg.pct_of_peak_sustained_active", "type": "Double", "unit": "", - "value": 19.710210452487605 + "value": 5.120293042988572 }, { "name": "smsp__inst_executed_pipe_tex.avg.pct_of_peak_sustained_active", @@ -26886,7 +29122,7 @@ "name": "smsp__issue_active.avg.pct_of_peak_sustained_active", "type": "Double", "unit": "", - "value": 64.59446091831055 + "value": 90.92887003521024 }, { "name": "smsp__thread_inst_executed_per_inst_executed.ratio", @@ -26905,30 +29141,47 @@ "time" ], "times": { - "compilation": 24818.291, - "data": 60824.515, - "framework": 206011.906, - "kernel_overhead": 35679.019, - "profiling_overhead": 50414.61, - "profiling_runs": 59093.762, + "compilation_time": 14859.68, + "data": 58589.051, + "framework": 1896046.875, + "kernel_overhead": 878184.64, + "profiling_overhead": 48406.804, + "profiling_runs": 910866.38, "runtimes": [ - 2243.2 + 4286.4 ], - "search_algorithm": 37.835, - "validation": 24.075 + "search_algorithm": 25.964, + "validation": 15.025 }, - "timestamp": "2026-01-27 09:26:15 UTC" + "timestamp": "2026-03-13 09:40:52 UTC" }, { + "compilation_data": { + "constant_memory_size": 0, + "global_size": { + "x": 16, + "y": 64, + "z": 16 + }, + "local_memory_size": 0, + "local_size": { + "x": 16, + "y": 4, + "z": 1 + }, + "max_work_group_size": 1024, + "private_memory_size": 0, + "registers": 38 + }, "configuration": { - "INNER_UNROLL_FACTOR": "0", - "USE_CONSTANT_MEMORY": "0", - "USE_SOA": "1", - "VECTOR_SIZE": "1", - "WORK_GROUP_SIZE_X": "32", - "WORK_GROUP_SIZE_Y": "2", - "WORK_GROUP_SIZE_Z": "1", - "Z_ITERATIONS": "4" + "INNER_UNROLL_FACTOR": 2, + "USE_CONSTANT_MEMORY": 0, + "USE_SOA": 1, + "VECTOR_SIZE": 1, + "WORK_GROUP_SIZE_X": 16, + "WORK_GROUP_SIZE_Y": 4, + "WORK_GROUP_SIZE_Z": 1, + "Z_ITERATIONS": 16 }, "correctness": 1, "invalidity": "correct", @@ -26936,61 +29189,61 @@ { "name": "time", "unit": "", - "value": 2286.592 + "value": 7028.928 }, { "name": "dram__throughput.avg.pct_of_peak_sustained_elapsed", "type": "Double", "unit": "", - "value": 46.21789486317811 + "value": 7.110933968551393 }, { "name": "dram__sectors_read.sum", "type": "Double", "unit": "", - "value": 2103584.0 + "value": 13308.0 }, { "name": "dram__sectors_write.sum", "type": "Double", "unit": "", - "value": 1839712.0 + "value": 1871424.0 }, { "name": "lts__t_sectors.avg.pct_of_peak_sustained_elapsed", "type": "Double", "unit": "", - "value": 5.492701716602163 + "value": 0.9291264879970522 }, { "name": "lts__t_sectors_op_read.sum", "type": "Double", "unit": "", - "value": 2141445.0 + "value": 125969.0 }, { "name": "lts__t_sectors_op_write.sum", "type": "Double", "unit": "", - "value": 2100481.0 + "value": 2102048.0 }, { "name": "l1tex__t_requests_pipe_lsu_mem_global_op_ld.avg.pct_of_peak_sustained_active", "type": "Double", "unit": "", - "value": 38.393860933541916 + "value": 3.0411083640685184 }, { "name": "l1tex__t_requests_pipe_lsu_mem_global_op_ld.sum", "type": "Double", "unit": "", - "value": 34078720.0 + "value": 8388608.0 }, { "name": "l1tex__data_pipe_lsu_wavefronts_mem_shared.avg.pct_of_peak_sustained_elapsed", "type": "Double", "unit": "", - "value": 0.44275779953913624 + "value": 0.035631692523083104 }, { "name": "l1tex__data_pipe_lsu_wavefronts_mem_shared_op_ld.sum", @@ -27020,13 +29273,13 @@ "name": "sm__warps_active.avg.pct_of_peak_sustained_active", "type": "Double", "unit": "", - "value": 96.34868329403355 + "value": 98.84925407232556 }, { "name": "smsp__cycles_active.avg.pct_of_peak_sustained_elapsed", "type": "Double", "unit": "", - "value": 99.92449524511885 + "value": 99.97812555013023 }, { "name": "smsp__sass_thread_inst_executed_op_fp16_pred_on.sum", @@ -27038,7 +29291,7 @@ "name": "smsp__sass_thread_inst_executed_op_fp32_pred_on.sum", "type": "Double", "unit": "", - "value": 5385486336.0 + "value": 4630511616.0 }, { "name": "smsp__sass_thread_inst_executed_op_fp64_pred_on.sum", @@ -27050,43 +29303,43 @@ "name": "smsp__sass_thread_inst_executed_op_integer_pred_on.sum", "type": "Double", "unit": "", - "value": 591396864.0 + "value": 8738832384.0 }, { "name": "smsp__sass_thread_inst_executed_op_control_pred_on.sum", "type": "Double", "unit": "", - "value": 71303168.0 + "value": 536870912.0 }, { "name": "smsp__sass_thread_inst_executed_op_memory_pred_on.sum", "type": "Double", "unit": "", - "value": 1107296256.0 + "value": 285212672.0 }, { "name": "smsp__sass_thread_inst_executed_op_misc_pred_on.sum", "type": "Double", "unit": "", - "value": 146800640.0 + "value": 1157627904.0 }, { "name": "smsp__sass_thread_inst_executed_op_conversion_pred_on.sum", "type": "Double", "unit": "", - "value": 12582912.0 + "value": 3145728.0 }, { "name": "smsp__inst_executed.sum", "type": "Double", "unit": "", - "value": 229376000.0 + "value": 970653696.0 }, { "name": "smsp__pipe_fma_cycles_active.avg.pct_of_peak_sustained_active", "type": "Double", "unit": "", - "value": 43.238428726487285 + "value": 36.47693967543911 }, { "name": "smsp__inst_executed_pipe_fp64.avg.pct_of_peak_sustained_active", @@ -27098,13 +29351,13 @@ "name": "smsp__inst_executed_pipe_xu.avg.pct_of_peak_sustained_active", "type": "Double", "unit": "", - "value": 75.62109531066534 + "value": 24.329890792189442 }, { "name": "smsp__inst_executed_pipe_lsu.avg.pct_of_peak_sustained_active", "type": "Double", "unit": "", - "value": 19.717609812448874 + "value": 1.6334765546513907 }, { "name": "smsp__inst_executed_pipe_tex.avg.pct_of_peak_sustained_active", @@ -27116,7 +29369,7 @@ "name": "smsp__issue_active.avg.pct_of_peak_sustained_active", "type": "Double", "unit": "", - "value": 64.61865719896913 + "value": 87.97640682830651 }, { "name": "smsp__thread_inst_executed_per_inst_executed.ratio", @@ -27135,30 +29388,47 @@ "time" ], "times": { - "compilation": 25016.607, - "data": 60750.75, - "framework": 205474.736, - "kernel_overhead": 35492.608, - "profiling_overhead": 50324.931, - "profiling_runs": 58906.447, + "compilation_time": 14534.173, + "data": 59058.714, + "framework": 3133892.807, + "kernel_overhead": 1490003.832, + "profiling_overhead": 48707.027, + "profiling_runs": 1536123.234, "runtimes": [ - 2286.592 + 7028.928 ], - "search_algorithm": 31.257, - "validation": 19.702 + "search_algorithm": 24.24, + "validation": 14.313 }, - "timestamp": "2026-01-27 09:26:15 UTC" + "timestamp": "2026-03-13 09:40:54 UTC" }, { + "compilation_data": { + "constant_memory_size": 0, + "global_size": { + "x": 16, + "y": 64, + "z": 8 + }, + "local_memory_size": 0, + "local_size": { + "x": 16, + "y": 4, + "z": 1 + }, + "max_work_group_size": 1024, + "private_memory_size": 128, + "registers": 32 + }, "configuration": { - "INNER_UNROLL_FACTOR": "0", - "USE_CONSTANT_MEMORY": "0", - "USE_SOA": "1", - "VECTOR_SIZE": "1", - "WORK_GROUP_SIZE_X": "32", - "WORK_GROUP_SIZE_Y": "4", - "WORK_GROUP_SIZE_Z": "1", - "Z_ITERATIONS": "4" + "INNER_UNROLL_FACTOR": 2, + "USE_CONSTANT_MEMORY": 0, + "USE_SOA": 1, + "VECTOR_SIZE": 1, + "WORK_GROUP_SIZE_X": 16, + "WORK_GROUP_SIZE_Y": 4, + "WORK_GROUP_SIZE_Z": 1, + "Z_ITERATIONS": 32 }, "correctness": 1, "invalidity": "correct", @@ -27166,61 +29436,308 @@ { "name": "time", "unit": "", - "value": 2227.776 + "value": 9622.432 }, { "name": "dram__throughput.avg.pct_of_peak_sustained_elapsed", "type": "Double", "unit": "", - "value": 45.97497912130522 + "value": 6.0271896337156505 }, { "name": "dram__sectors_read.sum", "type": "Double", "unit": "", - "value": 2103360.0 + "value": 16820.0 }, { "name": "dram__sectors_write.sum", "type": "Double", "unit": "", - "value": 1842632.0 + "value": 2276868.0 }, { "name": "lts__t_sectors.avg.pct_of_peak_sustained_elapsed", "type": "Double", "unit": "", - "value": 5.470672578879817 + "value": 81.83999645906262 }, { "name": "lts__t_sectors_op_read.sum", "type": "Double", "unit": "", - "value": 2142422.0 + "value": 133237647.0 }, { "name": "lts__t_sectors_op_write.sum", "type": "Double", "unit": "", - "value": 2100630.0 + "value": 138417423.0 }, { "name": "l1tex__t_requests_pipe_lsu_mem_global_op_ld.avg.pct_of_peak_sustained_active", "type": "Double", "unit": "", - "value": 38.39571321899251 + "value": 1.1020887542971285 }, { "name": "l1tex__t_requests_pipe_lsu_mem_global_op_ld.sum", "type": "Double", "unit": "", - "value": 34078720.0 + "value": 4194304.0 }, { "name": "l1tex__data_pipe_lsu_wavefronts_mem_shared.avg.pct_of_peak_sustained_elapsed", "type": "Double", "unit": "", - "value": 0.4427572012902221 + "value": 0.012786754254545857 + }, + { + "name": "l1tex__data_pipe_lsu_wavefronts_mem_shared_op_ld.sum", + "type": "Double", + "unit": "", + "value": 0.0 + }, + { + "name": "l1tex__data_pipe_lsu_wavefronts_mem_shared_op_st.sum", + "type": "Double", + "unit": "", + "value": 0.0 + }, + { + "name": "l1tex__t_sectors_pipe_lsu_mem_local_op_ld.sum", + "type": "Double", + "unit": "", + "value": 136314880.0 + }, + { + "name": "l1tex__t_sectors_pipe_lsu_mem_local_op_st.sum", + "type": "Double", + "unit": "", + "value": 136314880.0 + }, + { + "name": "sm__warps_active.avg.pct_of_peak_sustained_active", + "type": "Double", + "unit": "", + "value": 98.89438065367416 + }, + { + "name": "smsp__cycles_active.avg.pct_of_peak_sustained_elapsed", + "type": "Double", + "unit": "", + "value": 98.83071464324571 + }, + { + "name": "smsp__sass_thread_inst_executed_op_fp16_pred_on.sum", + "type": "Double", + "unit": "", + "value": 0.0 + }, + { + "name": "smsp__sass_thread_inst_executed_op_fp32_pred_on.sum", + "type": "Double", + "unit": "", + "value": 4462739456.0 + }, + { + "name": "smsp__sass_thread_inst_executed_op_fp64_pred_on.sum", + "type": "Double", + "unit": "", + "value": 0.0 + }, + { + "name": "smsp__sass_thread_inst_executed_op_integer_pred_on.sum", + "type": "Double", + "unit": "", + "value": 1918369792.0 + }, + { + "name": "smsp__sass_thread_inst_executed_op_control_pred_on.sum", + "type": "Double", + "unit": "", + "value": 536870912.0 + }, + { + "name": "smsp__sass_thread_inst_executed_op_memory_pred_on.sum", + "type": "Double", + "unit": "", + "value": 1245708288.0 + }, + { + "name": "smsp__sass_thread_inst_executed_op_misc_pred_on.sum", + "type": "Double", + "unit": "", + "value": 72351744.0 + }, + { + "name": "smsp__sass_thread_inst_executed_op_conversion_pred_on.sum", + "type": "Double", + "unit": "", + "value": 1572864.0 + }, + { + "name": "smsp__inst_executed.sum", + "type": "Double", + "unit": "", + "value": 258555904.0 + }, + { + "name": "smsp__pipe_fma_cycles_active.avg.pct_of_peak_sustained_active", + "type": "Double", + "unit": "", + "value": 7.698805569295245 + }, + { + "name": "smsp__inst_executed_pipe_fp64.avg.pct_of_peak_sustained_active", + "type": "Double", + "unit": "", + "value": 0.0 + }, + { + "name": "smsp__inst_executed_pipe_xu.avg.pct_of_peak_sustained_active", + "type": "Double", + "unit": "", + "value": 17.664732944503108 + }, + { + "name": "smsp__inst_executed_pipe_lsu.avg.pct_of_peak_sustained_active", + "type": "Double", + "unit": "", + "value": 5.129931600948838 + }, + { + "name": "smsp__inst_executed_pipe_tex.avg.pct_of_peak_sustained_active", + "type": "Double", + "unit": "", + "value": 0.0 + }, + { + "name": "smsp__issue_active.avg.pct_of_peak_sustained_active", + "type": "Double", + "unit": "", + "value": 17.0148896951419 + }, + { + "name": "smsp__thread_inst_executed_per_inst_executed.ratio", + "type": "Double", + "unit": "", + "value": 32.0 + }, + { + "name": "smsp__thread_inst_executed_per_inst_executed.pct", + "type": "Double", + "unit": "", + "value": 100.0 + } + ], + "objectives": [ + "time" + ], + "times": { + "compilation_time": 14543.556, + "data": 60412.184, + "framework": 557526.919, + "kernel_overhead": 192495.484, + "profiling_overhead": 50171.465, + "profiling_runs": 254447.786, + "runtimes": [ + 9622.432 + ], + "search_algorithm": 35.059, + "validation": 17.073 + }, + "timestamp": "2026-03-13 09:40:54 UTC" + }, + { + "compilation_data": { + "constant_memory_size": 0, + "global_size": { + "x": 16, + "y": 64, + "z": 32 + }, + "local_memory_size": 0, + "local_size": { + "x": 16, + "y": 4, + "z": 1 + }, + "max_work_group_size": 1024, + "private_memory_size": 0, + "registers": 31 + }, + "configuration": { + "INNER_UNROLL_FACTOR": 4, + "USE_CONSTANT_MEMORY": 0, + "USE_SOA": 1, + "VECTOR_SIZE": 1, + "WORK_GROUP_SIZE_X": 16, + "WORK_GROUP_SIZE_Y": 4, + "WORK_GROUP_SIZE_Z": 1, + "Z_ITERATIONS": 8 + }, + "correctness": 1, + "invalidity": "correct", + "measurements": [ + { + "name": "time", + "unit": "", + "value": 2853.792 + }, + { + "name": "dram__throughput.avg.pct_of_peak_sustained_elapsed", + "type": "Double", + "unit": "", + "value": 17.237266289284538 + }, + { + "name": "dram__sectors_read.sum", + "type": "Double", + "unit": "", + "value": 5772.0 + }, + { + "name": "dram__sectors_write.sum", + "type": "Double", + "unit": "", + "value": 1872908.0 + }, + { + "name": "lts__t_sectors.avg.pct_of_peak_sustained_elapsed", + "type": "Double", + "unit": "", + "value": 2.2231514068603184 + }, + { + "name": "lts__t_sectors_op_read.sum", + "type": "Double", + "unit": "", + "value": 57304.0 + }, + { + "name": "lts__t_sectors_op_write.sum", + "type": "Double", + "unit": "", + "value": 2107697.0 + }, + { + "name": "l1tex__t_requests_pipe_lsu_mem_global_op_ld.avg.pct_of_peak_sustained_active", + "type": "Double", + "unit": "", + "value": 14.85870108071237 + }, + { + "name": "l1tex__t_requests_pipe_lsu_mem_global_op_ld.sum", + "type": "Double", + "unit": "", + "value": 16777216.0 + }, + { + "name": "l1tex__data_pipe_lsu_wavefronts_mem_shared.avg.pct_of_peak_sustained_elapsed", + "type": "Double", + "unit": "", + "value": 0.17403877674024437 }, { "name": "l1tex__data_pipe_lsu_wavefronts_mem_shared_op_ld.sum", @@ -27250,13 +29767,13 @@ "name": "sm__warps_active.avg.pct_of_peak_sustained_active", "type": "Double", "unit": "", - "value": 95.66435442012794 + "value": 98.84418834915762 }, { "name": "smsp__cycles_active.avg.pct_of_peak_sustained_elapsed", "type": "Double", "unit": "", - "value": 99.92053861901766 + "value": 99.94343128963116 }, { "name": "smsp__sass_thread_inst_executed_op_fp16_pred_on.sum", @@ -27268,7 +29785,7 @@ "name": "smsp__sass_thread_inst_executed_op_fp32_pred_on.sum", "type": "Double", "unit": "", - "value": 5385486336.0 + "value": 4966055936.0 }, { "name": "smsp__sass_thread_inst_executed_op_fp64_pred_on.sum", @@ -27280,43 +29797,43 @@ "name": "smsp__sass_thread_inst_executed_op_integer_pred_on.sum", "type": "Double", "unit": "", - "value": 591396864.0 + "value": 3042967552.0 }, { "name": "smsp__sass_thread_inst_executed_op_control_pred_on.sum", "type": "Double", "unit": "", - "value": 71303168.0 + "value": 268435456.0 }, { "name": "smsp__sass_thread_inst_executed_op_memory_pred_on.sum", "type": "Double", "unit": "", - "value": 1107296256.0 + "value": 553648128.0 }, { "name": "smsp__sass_thread_inst_executed_op_misc_pred_on.sum", "type": "Double", "unit": "", - "value": 146800640.0 + "value": 1644167168.0 }, { "name": "smsp__sass_thread_inst_executed_op_conversion_pred_on.sum", "type": "Double", "unit": "", - "value": 12582912.0 + "value": 6291456.0 }, { "name": "smsp__inst_executed.sum", "type": "Double", "unit": "", - "value": 229376000.0 + "value": 399179776.0 }, { "name": "smsp__pipe_fma_cycles_active.avg.pct_of_peak_sustained_active", "type": "Double", "unit": "", - "value": 43.240432905694384 + "value": 45.94277900113985 }, { "name": "smsp__inst_executed_pipe_fp64.avg.pct_of_peak_sustained_active", @@ -27328,13 +29845,13 @@ "name": "smsp__inst_executed_pipe_xu.avg.pct_of_peak_sustained_active", "type": "Double", "unit": "", - "value": 75.6239875517271 + "value": 59.4388595903316 }, { "name": "smsp__inst_executed_pipe_lsu.avg.pct_of_peak_sustained_active", "type": "Double", "unit": "", - "value": 19.718363941710095 + "value": 7.749109136044208 }, { "name": "smsp__inst_executed_pipe_tex.avg.pct_of_peak_sustained_active", @@ -27346,7 +29863,7 @@ "name": "smsp__issue_active.avg.pct_of_peak_sustained_active", "type": "Double", "unit": "", - "value": 64.62118159905617 + "value": 88.38995494257121 }, { "name": "smsp__thread_inst_executed_per_inst_executed.ratio", @@ -27365,30 +29882,47 @@ "time" ], "times": { - "compilation": 25105.473, - "data": 60110.826, - "framework": 204843.752, - "kernel_overhead": 35675.296, - "profiling_overhead": 50109.825, - "profiling_runs": 58947.805, + "compilation_time": 15056.467, + "data": 58768.045, + "framework": 1160671.72, + "kernel_overhead": 513648.29, + "profiling_overhead": 48537.599, + "profiling_runs": 539717.786, "runtimes": [ - 2227.776 + 2853.792 ], - "search_algorithm": 35.152, - "validation": 20.563 + "search_algorithm": 37.615, + "validation": 17.587 }, - "timestamp": "2026-01-27 09:26:15 UTC" + "timestamp": "2026-03-13 09:40:55 UTC" }, { + "compilation_data": { + "constant_memory_size": 0, + "global_size": { + "x": 16, + "y": 64, + "z": 16 + }, + "local_memory_size": 0, + "local_size": { + "x": 16, + "y": 4, + "z": 1 + }, + "max_work_group_size": 1024, + "private_memory_size": 0, + "registers": 38 + }, "configuration": { - "INNER_UNROLL_FACTOR": "0", - "USE_CONSTANT_MEMORY": "0", - "USE_SOA": "1", - "VECTOR_SIZE": "1", - "WORK_GROUP_SIZE_X": "32", - "WORK_GROUP_SIZE_Y": "8", - "WORK_GROUP_SIZE_Z": "1", - "Z_ITERATIONS": "4" + "INNER_UNROLL_FACTOR": 4, + "USE_CONSTANT_MEMORY": 0, + "USE_SOA": 1, + "VECTOR_SIZE": 1, + "WORK_GROUP_SIZE_X": 16, + "WORK_GROUP_SIZE_Y": 4, + "WORK_GROUP_SIZE_Z": 1, + "Z_ITERATIONS": 16 }, "correctness": 1, "invalidity": "correct", @@ -27396,61 +29930,61 @@ { "name": "time", "unit": "", - "value": 2224.416 + "value": 3615.264 }, { "name": "dram__throughput.avg.pct_of_peak_sustained_elapsed", "type": "Double", "unit": "", - "value": 46.7203986377596 + "value": 13.93466396363775 }, { "name": "dram__sectors_read.sum", "type": "Double", "unit": "", - "value": 2097352.0 + "value": 2488.0 }, { "name": "dram__sectors_write.sum", "type": "Double", "unit": "", - "value": 1836072.0 + "value": 1869332.0 }, { "name": "lts__t_sectors.avg.pct_of_peak_sustained_elapsed", "type": "Double", "unit": "", - "value": 5.514706955445585 + "value": 1.7847536960234267 }, { "name": "lts__t_sectors_op_read.sum", "type": "Double", "unit": "", - "value": 2132920.0 + "value": 60632.0 }, { "name": "lts__t_sectors_op_write.sum", "type": "Double", "unit": "", - "value": 2098982.0 + "value": 2099650.0 }, { "name": "l1tex__t_requests_pipe_lsu_mem_global_op_ld.avg.pct_of_peak_sustained_active", "type": "Double", "unit": "", - "value": 38.462372347784104 + "value": 6.037428798517981 }, { "name": "l1tex__t_requests_pipe_lsu_mem_global_op_ld.sum", "type": "Double", "unit": "", - "value": 34078720.0 + "value": 8388608.0 }, { "name": "l1tex__data_pipe_lsu_wavefronts_mem_shared.avg.pct_of_peak_sustained_elapsed", "type": "Double", "unit": "", - "value": 0.4434512182909192 + "value": 0.07072681239183483 }, { "name": "l1tex__data_pipe_lsu_wavefronts_mem_shared_op_ld.sum", @@ -27480,13 +30014,13 @@ "name": "sm__warps_active.avg.pct_of_peak_sustained_active", "type": "Double", "unit": "", - "value": 96.34065665184595 + "value": 98.70608642477352 }, { "name": "smsp__cycles_active.avg.pct_of_peak_sustained_elapsed", "type": "Double", "unit": "", - "value": 99.9087959345794 + "value": 99.96459396518638 }, { "name": "smsp__sass_thread_inst_executed_op_fp16_pred_on.sum", @@ -27498,7 +30032,7 @@ "name": "smsp__sass_thread_inst_executed_op_fp32_pred_on.sum", "type": "Double", "unit": "", - "value": 5385486336.0 + "value": 4630511616.0 }, { "name": "smsp__sass_thread_inst_executed_op_fp64_pred_on.sum", @@ -27510,43 +30044,43 @@ "name": "smsp__sass_thread_inst_executed_op_integer_pred_on.sum", "type": "Double", "unit": "", - "value": 591396864.0 + "value": 3301965824.0 }, { "name": "smsp__sass_thread_inst_executed_op_control_pred_on.sum", "type": "Double", "unit": "", - "value": 71303168.0 + "value": 268435456.0 }, { "name": "smsp__sass_thread_inst_executed_op_memory_pred_on.sum", "type": "Double", "unit": "", - "value": 1107296256.0 + "value": 285212672.0 }, { "name": "smsp__sass_thread_inst_executed_op_misc_pred_on.sum", "type": "Double", "unit": "", - "value": 146800640.0 + "value": 1429209088.0 }, { "name": "smsp__sass_thread_inst_executed_op_conversion_pred_on.sum", "type": "Double", "unit": "", - "value": 12582912.0 + "value": 3145728.0 }, { "name": "smsp__inst_executed.sum", "type": "Double", "unit": "", - "value": 229376000.0 + "value": 513540096.0 }, { "name": "smsp__pipe_fma_cycles_active.avg.pct_of_peak_sustained_active", "type": "Double", "unit": "", - "value": 43.31259001296101 + "value": 42.22124827013381 }, { "name": "smsp__inst_executed_pipe_fp64.avg.pct_of_peak_sustained_active", @@ -27558,13 +30092,13 @@ "name": "smsp__inst_executed_pipe_xu.avg.pct_of_peak_sustained_active", "type": "Double", "unit": "", - "value": 75.75142963842802 + "value": 48.29993835248061 }, { "name": "smsp__inst_executed_pipe_lsu.avg.pct_of_peak_sustained_active", "type": "Double", "unit": "", - "value": 19.751593470176058 + "value": 3.2427937126299238 }, { "name": "smsp__inst_executed_pipe_tex.avg.pct_of_peak_sustained_active", @@ -27576,7 +30110,7 @@ "name": "smsp__issue_active.avg.pct_of_peak_sustained_active", "type": "Double", "unit": "", - "value": 64.73008167818027 + "value": 92.40266444487987 }, { "name": "smsp__thread_inst_executed_per_inst_executed.ratio", @@ -27595,30 +30129,47 @@ "time" ], "times": { - "compilation": 26077.985, - "data": 61222.884, - "framework": 206775.74099999998, - "kernel_overhead": 35799.381, - "profiling_overhead": 50509.746, - "profiling_runs": 59243.73, + "compilation_time": 14226.304, + "data": 57655.842, + "framework": 1844124.62, + "kernel_overhead": 854819.172, + "profiling_overhead": 47324.53, + "profiling_runs": 884325.076, "runtimes": [ - 2224.416 + 3615.264 ], - "search_algorithm": 35.211, - "validation": 22.052 + "search_algorithm": 25.862, + "validation": 16.271 }, - "timestamp": "2026-01-27 09:26:15 UTC" + "timestamp": "2026-03-13 09:40:55 UTC" }, { + "compilation_data": { + "constant_memory_size": 0, + "global_size": { + "x": 16, + "y": 64, + "z": 8 + }, + "local_memory_size": 0, + "local_size": { + "x": 16, + "y": 4, + "z": 1 + }, + "max_work_group_size": 1024, + "private_memory_size": 0, + "registers": 56 + }, "configuration": { - "INNER_UNROLL_FACTOR": "0", - "USE_CONSTANT_MEMORY": "0", - "USE_SOA": "1", - "VECTOR_SIZE": "1", - "WORK_GROUP_SIZE_X": "16", - "WORK_GROUP_SIZE_Y": "4", - "WORK_GROUP_SIZE_Z": "1", - "Z_ITERATIONS": "8" + "INNER_UNROLL_FACTOR": 4, + "USE_CONSTANT_MEMORY": 0, + "USE_SOA": 1, + "VECTOR_SIZE": 1, + "WORK_GROUP_SIZE_X": 16, + "WORK_GROUP_SIZE_Y": 4, + "WORK_GROUP_SIZE_Z": 1, + "Z_ITERATIONS": 32 }, "correctness": 1, "invalidity": "correct", @@ -27626,61 +30177,61 @@ { "name": "time", "unit": "", - "value": 2003.52 + "value": 5780.352 }, { "name": "dram__throughput.avg.pct_of_peak_sustained_elapsed", "type": "Double", "unit": "", - "value": 52.989339593596064 + "value": 8.68103626789729 }, { "name": "dram__sectors_read.sum", "type": "Double", "unit": "", - "value": 2098056.0 + "value": 10588.0 }, { "name": "dram__sectors_write.sum", "type": "Double", "unit": "", - "value": 1867344.0 + "value": 1867516.0 }, { "name": "lts__t_sectors.avg.pct_of_peak_sustained_elapsed", "type": "Double", "unit": "", - "value": 6.224973343480999 + "value": 1.1262971238006843 }, { "name": "lts__t_sectors_op_read.sum", "type": "Double", "unit": "", - "value": 2117144.0 + "value": 102050.0 }, { "name": "lts__t_sectors_op_write.sum", "type": "Double", "unit": "", - "value": 2099110.0 + "value": 2102171.0 }, { "name": "l1tex__t_requests_pipe_lsu_mem_global_op_ld.avg.pct_of_peak_sustained_active", "type": "Double", "unit": "", - "value": 22.09051461623991 + "value": 1.862821978920939 }, { "name": "l1tex__t_requests_pipe_lsu_mem_global_op_ld.sum", "type": "Double", "unit": "", - "value": 17301504.0 + "value": 4194304.0 }, { "name": "l1tex__data_pipe_lsu_wavefronts_mem_shared.avg.pct_of_peak_sustained_elapsed", "type": "Double", "unit": "", - "value": 0.2509218898412472 + "value": 0.021826389191589125 }, { "name": "l1tex__data_pipe_lsu_wavefronts_mem_shared_op_ld.sum", @@ -27710,13 +30261,13 @@ "name": "sm__warps_active.avg.pct_of_peak_sustained_active", "type": "Double", "unit": "", - "value": 97.7580262439909 + "value": 73.90563897840381 }, { "name": "smsp__cycles_active.avg.pct_of_peak_sustained_elapsed", "type": "Double", "unit": "", - "value": 99.9140180807518 + "value": 99.9777318830934 }, { "name": "smsp__sass_thread_inst_executed_op_fp16_pred_on.sum", @@ -27728,7 +30279,7 @@ "name": "smsp__sass_thread_inst_executed_op_fp32_pred_on.sum", "type": "Double", "unit": "", - "value": 4848615424.0 + "value": 4462739456.0 }, { "name": "smsp__sass_thread_inst_executed_op_fp64_pred_on.sum", @@ -27740,43 +30291,43 @@ "name": "smsp__sass_thread_inst_executed_op_integer_pred_on.sum", "type": "Double", "unit": "", - "value": 564133888.0 + "value": 4806148096.0 }, { "name": "smsp__sass_thread_inst_executed_op_control_pred_on.sum", "type": "Double", "unit": "", - "value": 69206016.0 + "value": 268435456.0 }, { "name": "smsp__sass_thread_inst_executed_op_memory_pred_on.sum", "type": "Double", "unit": "", - "value": 570425344.0 + "value": 150994944.0 }, { "name": "smsp__sass_thread_inst_executed_op_misc_pred_on.sum", "type": "Double", "unit": "", - "value": 115343360.0 + "value": 1222115328.0 }, { "name": "smsp__sass_thread_inst_executed_op_conversion_pred_on.sum", "type": "Double", "unit": "", - "value": 6291456.0 + "value": 1572864.0 }, { "name": "smsp__inst_executed.sum", "type": "Double", "unit": "", - "value": 193331200.0 + "value": 868564992.0 }, { "name": "smsp__pipe_fma_cycles_active.avg.pct_of_peak_sustained_active", "type": "Double", "unit": "", - "value": 43.57270160426023 + "value": 42.34344863971165 }, { "name": "smsp__inst_executed_pipe_fp64.avg.pct_of_peak_sustained_active", @@ -27788,13 +30339,13 @@ "name": "smsp__inst_executed_pipe_xu.avg.pct_of_peak_sustained_active", "type": "Double", "unit": "", - "value": 85.72171023748695 + "value": 29.806934152526377 }, { "name": "smsp__inst_executed_pipe_lsu.avg.pct_of_peak_sustained_active", "type": "Double", "unit": "", - "value": 11.510483552396929 + "value": 1.0588156540997529 }, { "name": "smsp__inst_executed_pipe_tex.avg.pct_of_peak_sustained_active", @@ -27806,7 +30357,7 @@ "name": "smsp__issue_active.avg.pct_of_peak_sustained_active", "type": "Double", "unit": "", - "value": 61.73916838327478 + "value": 96.44556919790958 }, { "name": "smsp__thread_inst_executed_per_inst_executed.ratio", @@ -27825,30 +30376,47 @@ "time" ], "times": { - "compilation": 26382.749, - "data": 59961.94, - "framework": 199872.67299999998, - "kernel_overhead": 34111.715, - "profiling_overhead": 49746.292, - "profiling_runs": 56052.726, + "compilation_time": 14675.958, + "data": 57963.312, + "framework": 3325535.6059999997, + "kernel_overhead": 1589968.295, + "profiling_overhead": 47880.774, + "profiling_runs": 1629723.225, "runtimes": [ - 2003.52 + 5780.352 ], - "search_algorithm": 35.301, - "validation": 19.471 + "search_algorithm": 27.75, + "validation": 18.505 }, - "timestamp": "2026-01-27 09:26:15 UTC" + "timestamp": "2026-03-13 09:40:57 UTC" }, { + "compilation_data": { + "constant_memory_size": 0, + "global_size": { + "x": 16, + "y": 64, + "z": 16 + }, + "local_memory_size": 0, + "local_size": { + "x": 16, + "y": 4, + "z": 1 + }, + "max_work_group_size": 1024, + "private_memory_size": 0, + "registers": 38 + }, "configuration": { - "INNER_UNROLL_FACTOR": "0", - "USE_CONSTANT_MEMORY": "0", - "USE_SOA": "1", - "VECTOR_SIZE": "1", - "WORK_GROUP_SIZE_X": "16", - "WORK_GROUP_SIZE_Y": "8", - "WORK_GROUP_SIZE_Z": "1", - "Z_ITERATIONS": "8" + "INNER_UNROLL_FACTOR": 8, + "USE_CONSTANT_MEMORY": 0, + "USE_SOA": 1, + "VECTOR_SIZE": 1, + "WORK_GROUP_SIZE_X": 16, + "WORK_GROUP_SIZE_Y": 4, + "WORK_GROUP_SIZE_Z": 1, + "Z_ITERATIONS": 16 }, "correctness": 1, "invalidity": "correct", @@ -27856,61 +30424,61 @@ { "name": "time", "unit": "", - "value": 1950.496 + "value": 3436.736 }, { "name": "dram__throughput.avg.pct_of_peak_sustained_elapsed", "type": "Double", "unit": "", - "value": 53.14600220603325 + "value": 14.525552226704635 }, { "name": "dram__sectors_read.sum", "type": "Double", "unit": "", - "value": 2105652.0 + "value": 8940.0 }, { "name": "dram__sectors_write.sum", "type": "Double", "unit": "", - "value": 1872288.0 + "value": 1871452.0 }, { "name": "lts__t_sectors.avg.pct_of_peak_sustained_elapsed", "type": "Double", "unit": "", - "value": 6.252448325682525 + "value": 1.8516979438790577 }, { "name": "lts__t_sectors_op_read.sum", "type": "Double", "unit": "", - "value": 2124781.0 + "value": 66562.0 }, { "name": "lts__t_sectors_op_write.sum", "type": "Double", "unit": "", - "value": 2101420.0 + "value": 2101448.0 }, { "name": "l1tex__t_requests_pipe_lsu_mem_global_op_ld.avg.pct_of_peak_sustained_active", "type": "Double", "unit": "", - "value": 22.09142455059159 + "value": 6.226675273820059 }, { "name": "l1tex__t_requests_pipe_lsu_mem_global_op_ld.sum", "type": "Double", "unit": "", - "value": 17301504.0 + "value": 8388608.0 }, { "name": "l1tex__data_pipe_lsu_wavefronts_mem_shared.avg.pct_of_peak_sustained_elapsed", "type": "Double", "unit": "", - "value": 0.2508727105798032 + "value": 0.07295366830553172 }, { "name": "l1tex__data_pipe_lsu_wavefronts_mem_shared_op_ld.sum", @@ -27940,13 +30508,13 @@ "name": "sm__warps_active.avg.pct_of_peak_sustained_active", "type": "Double", "unit": "", - "value": 97.18714709197381 + "value": 98.7372014713465 }, { "name": "smsp__cycles_active.avg.pct_of_peak_sustained_elapsed", "type": "Double", "unit": "", - "value": 99.98014536856779 + "value": 99.9748663317574 }, { "name": "smsp__sass_thread_inst_executed_op_fp16_pred_on.sum", @@ -27958,7 +30526,7 @@ "name": "smsp__sass_thread_inst_executed_op_fp32_pred_on.sum", "type": "Double", "unit": "", - "value": 4848615424.0 + "value": 4630511616.0 }, { "name": "smsp__sass_thread_inst_executed_op_fp64_pred_on.sum", @@ -27970,43 +30538,43 @@ "name": "smsp__sass_thread_inst_executed_op_integer_pred_on.sum", "type": "Double", "unit": "", - "value": 564133888.0 + "value": 2563768320.0 }, { "name": "smsp__sass_thread_inst_executed_op_control_pred_on.sum", "type": "Double", "unit": "", - "value": 69206016.0 + "value": 134217728.0 }, { "name": "smsp__sass_thread_inst_executed_op_memory_pred_on.sum", "type": "Double", "unit": "", - "value": 570425344.0 + "value": 285212672.0 }, { "name": "smsp__sass_thread_inst_executed_op_misc_pred_on.sum", "type": "Double", "unit": "", - "value": 115343360.0 + "value": 1362100224.0 }, { "name": "smsp__sass_thread_inst_executed_op_conversion_pred_on.sum", "type": "Double", "unit": "", - "value": 6291456.0 + "value": 3145728.0 }, { "name": "smsp__inst_executed.sum", "type": "Double", "unit": "", - "value": 193331200.0 + "value": 484179968.0 }, { "name": "smsp__pipe_fma_cycles_active.avg.pct_of_peak_sustained_active", "type": "Double", "unit": "", - "value": 43.53508996036249 + "value": 42.76827871426585 }, { "name": "smsp__inst_executed_pipe_fp64.avg.pct_of_peak_sustained_active", @@ -28018,13 +30586,13 @@ "name": "smsp__inst_executed_pipe_xu.avg.pct_of_peak_sustained_active", "type": "Double", "unit": "", - "value": 85.64822368371345 + "value": 49.81555804033405 }, { "name": "smsp__inst_executed_pipe_lsu.avg.pct_of_peak_sustained_active", "type": "Double", "unit": "", - "value": 11.500615973154883 + "value": 3.3445504055400055 }, { "name": "smsp__inst_executed_pipe_tex.avg.pct_of_peak_sustained_active", @@ -28036,7 +30604,7 @@ "name": "smsp__issue_active.avg.pct_of_peak_sustained_active", "type": "Double", "unit": "", - "value": 61.68629874769482 + "value": 89.8536166277735 }, { "name": "smsp__thread_inst_executed_per_inst_executed.ratio", @@ -28055,30 +30623,47 @@ "time" ], "times": { - "compilation": 27289.909, - "data": 60155.051, - "framework": 199419.195, - "kernel_overhead": 33850.831, - "profiling_overhead": 49801.982, - "profiling_runs": 55611.331, + "compilation_time": 15196.694, + "data": 57398.952, + "framework": 1719311.7880000002, + "kernel_overhead": 792956.828, + "profiling_overhead": 47394.221, + "profiling_runs": 821561.787, "runtimes": [ - 1950.496 + 3436.736 ], - "search_algorithm": 31.356, - "validation": 20.346 + "search_algorithm": 26.701, + "validation": 16.324 }, - "timestamp": "2026-01-27 09:26:16 UTC" + "timestamp": "2026-03-13 09:40:58 UTC" }, { + "compilation_data": { + "constant_memory_size": 0, + "global_size": { + "x": 16, + "y": 64, + "z": 8 + }, + "local_memory_size": 0, + "local_size": { + "x": 16, + "y": 4, + "z": 1 + }, + "max_work_group_size": 1024, + "private_memory_size": 0, + "registers": 48 + }, "configuration": { - "INNER_UNROLL_FACTOR": "0", - "USE_CONSTANT_MEMORY": "0", - "USE_SOA": "1", - "VECTOR_SIZE": "1", - "WORK_GROUP_SIZE_X": "32", - "WORK_GROUP_SIZE_Y": "2", - "WORK_GROUP_SIZE_Z": "1", - "Z_ITERATIONS": "8" + "INNER_UNROLL_FACTOR": 8, + "USE_CONSTANT_MEMORY": 0, + "USE_SOA": 1, + "VECTOR_SIZE": 1, + "WORK_GROUP_SIZE_X": 16, + "WORK_GROUP_SIZE_Y": 4, + "WORK_GROUP_SIZE_Z": 1, + "Z_ITERATIONS": 32 }, "correctness": 1, "invalidity": "correct", @@ -28086,61 +30671,61 @@ { "name": "time", "unit": "", - "value": 1953.536 + "value": 6490.752 }, { "name": "dram__throughput.avg.pct_of_peak_sustained_elapsed", "type": "Double", "unit": "", - "value": 52.78193121058404 + "value": 7.756123924982411 }, { "name": "dram__sectors_read.sum", "type": "Double", "unit": "", - "value": 2102712.0 + "value": 18312.0 }, { "name": "dram__sectors_write.sum", "type": "Double", "unit": "", - "value": 1839600.0 + "value": 1872564.0 }, { "name": "lts__t_sectors.avg.pct_of_peak_sustained_elapsed", "type": "Double", "unit": "", - "value": 6.257521543565299 + "value": 1.0137894446124338 }, { "name": "lts__t_sectors_op_read.sum", "type": "Double", "unit": "", - "value": 2137877.0 + "value": 121957.0 }, { "name": "lts__t_sectors_op_write.sum", "type": "Double", "unit": "", - "value": 2103722.0 + "value": 2106415.0 }, { "name": "l1tex__t_requests_pipe_lsu_mem_global_op_ld.avg.pct_of_peak_sustained_active", "type": "Double", "unit": "", - "value": 22.116092669519464 + "value": 1.66062747935063 }, { "name": "l1tex__t_requests_pipe_lsu_mem_global_op_ld.sum", "type": "Double", "unit": "", - "value": 17301504.0 + "value": 4194304.0 }, { "name": "l1tex__data_pipe_lsu_wavefronts_mem_shared.avg.pct_of_peak_sustained_elapsed", "type": "Double", "unit": "", - "value": 0.2511858145427756 + "value": 0.01945603349680434 }, { "name": "l1tex__data_pipe_lsu_wavefronts_mem_shared_op_ld.sum", @@ -28170,13 +30755,13 @@ "name": "sm__warps_active.avg.pct_of_peak_sustained_active", "type": "Double", "unit": "", - "value": 98.23324278305016 + "value": 82.03314900107425 }, { "name": "smsp__cycles_active.avg.pct_of_peak_sustained_elapsed", "type": "Double", "unit": "", - "value": 99.93503354765622 + "value": 99.97223402511291 }, { "name": "smsp__sass_thread_inst_executed_op_fp16_pred_on.sum", @@ -28188,7 +30773,7 @@ "name": "smsp__sass_thread_inst_executed_op_fp32_pred_on.sum", "type": "Double", "unit": "", - "value": 4848615424.0 + "value": 4462739456.0 }, { "name": "smsp__sass_thread_inst_executed_op_fp64_pred_on.sum", @@ -28200,43 +30785,43 @@ "name": "smsp__sass_thread_inst_executed_op_integer_pred_on.sum", "type": "Double", "unit": "", - "value": 564133888.0 + "value": 7219970048.0 }, { "name": "smsp__sass_thread_inst_executed_op_control_pred_on.sum", "type": "Double", "unit": "", - "value": 69206016.0 + "value": 134217728.0 }, { "name": "smsp__sass_thread_inst_executed_op_memory_pred_on.sum", "type": "Double", "unit": "", - "value": 570425344.0 + "value": 150994944.0 }, { "name": "smsp__sass_thread_inst_executed_op_misc_pred_on.sum", "type": "Double", "unit": "", - "value": 115343360.0 + "value": 551026688.0 }, { "name": "smsp__sass_thread_inst_executed_op_conversion_pred_on.sum", "type": "Double", "unit": "", - "value": 6291456.0 + "value": 1572864.0 }, { "name": "smsp__inst_executed.sum", "type": "Double", "unit": "", - "value": 193331200.0 + "value": 952385536.0 }, { "name": "smsp__pipe_fma_cycles_active.avg.pct_of_peak_sustained_active", "type": "Double", "unit": "", - "value": 43.60976235749119 + "value": 43.24343689193959 }, { "name": "smsp__inst_executed_pipe_fp64.avg.pct_of_peak_sustained_active", @@ -28248,13 +30833,13 @@ "name": "smsp__inst_executed_pipe_xu.avg.pct_of_peak_sustained_active", "type": "Double", "unit": "", - "value": 85.79382857068634 + "value": 26.571348861688953 }, { "name": "smsp__inst_executed_pipe_lsu.avg.pct_of_peak_sustained_active", "type": "Double", "unit": "", - "value": 11.520167410614622 + "value": 0.9438797019960308 }, { "name": "smsp__inst_executed_pipe_tex.avg.pct_of_peak_sustained_active", @@ -28266,7 +30851,7 @@ "name": "smsp__issue_active.avg.pct_of_peak_sustained_active", "type": "Double", "unit": "", - "value": 61.79111513352752 + "value": 94.27315120469798 }, { "name": "smsp__thread_inst_executed_per_inst_executed.ratio", @@ -28285,30 +30870,47 @@ "time" ], "times": { - "compilation": 26340.387, - "data": 60031.96, - "framework": 198818.101, - "kernel_overhead": 33506.718, - "profiling_overhead": 49992.767, - "profiling_runs": 55286.656, + "compilation_time": 14907.995, + "data": 57675.395, + "framework": 3691663.864, + "kernel_overhead": 1771533.531, + "profiling_overhead": 47611.512, + "profiling_runs": 1814843.426, "runtimes": [ - 1953.536 + 6490.752 ], - "search_algorithm": 31.173, - "validation": 20.136 + "search_algorithm": 25.866, + "validation": 16.718 }, - "timestamp": "2026-01-27 09:26:16 UTC" + "timestamp": "2026-03-13 09:41:0 UTC" }, { + "compilation_data": { + "constant_memory_size": 0, + "global_size": { + "x": 16, + "y": 64, + "z": 8 + }, + "local_memory_size": 0, + "local_size": { + "x": 16, + "y": 4, + "z": 1 + }, + "max_work_group_size": 1024, + "private_memory_size": 0, + "registers": 48 + }, "configuration": { - "INNER_UNROLL_FACTOR": "0", - "USE_CONSTANT_MEMORY": "0", - "USE_SOA": "1", - "VECTOR_SIZE": "1", - "WORK_GROUP_SIZE_X": "32", - "WORK_GROUP_SIZE_Y": "4", - "WORK_GROUP_SIZE_Z": "1", - "Z_ITERATIONS": "8" + "INNER_UNROLL_FACTOR": 16, + "USE_CONSTANT_MEMORY": 0, + "USE_SOA": 1, + "VECTOR_SIZE": 1, + "WORK_GROUP_SIZE_X": 16, + "WORK_GROUP_SIZE_Y": 4, + "WORK_GROUP_SIZE_Z": 1, + "Z_ITERATIONS": 32 }, "correctness": 1, "invalidity": "correct", @@ -28316,61 +30918,61 @@ { "name": "time", "unit": "", - "value": 2023.871 + "value": 6031.904 }, { "name": "dram__throughput.avg.pct_of_peak_sustained_elapsed", "type": "Double", "unit": "", - "value": 52.42745250740476 + "value": 8.208810571454164 }, { "name": "dram__sectors_read.sum", "type": "Double", "unit": "", - "value": 2103036.0 + "value": 6156.0 }, { "name": "dram__sectors_write.sum", "type": "Double", "unit": "", - "value": 1839240.0 + "value": 1870780.0 }, { "name": "lts__t_sectors.avg.pct_of_peak_sustained_elapsed", "type": "Double", "unit": "", - "value": 6.238501887956158 + "value": 1.0663728645388455 }, { "name": "lts__t_sectors_op_read.sum", "type": "Double", "unit": "", - "value": 2136117.0 + "value": 102622.0 }, { "name": "lts__t_sectors_op_write.sum", "type": "Double", "unit": "", - "value": 2103552.0 + "value": 2103339.0 }, { "name": "l1tex__t_requests_pipe_lsu_mem_global_op_ld.avg.pct_of_peak_sustained_active", "type": "Double", "unit": "", - "value": 22.117256060330014 + "value": 1.7421164611822262 }, { "name": "l1tex__t_requests_pipe_lsu_mem_global_op_ld.sum", "type": "Double", "unit": "", - "value": 17301504.0 + "value": 4194304.0 }, { "name": "l1tex__data_pipe_lsu_wavefronts_mem_shared.avg.pct_of_peak_sustained_elapsed", "type": "Double", "unit": "", - "value": 0.251195904492815 + "value": 0.02041119861209165 }, { "name": "l1tex__data_pipe_lsu_wavefronts_mem_shared_op_ld.sum", @@ -28400,13 +31002,13 @@ "name": "sm__warps_active.avg.pct_of_peak_sustained_active", "type": "Double", "unit": "", - "value": 97.8267354515749 + "value": 82.01709649949342 }, { "name": "smsp__cycles_active.avg.pct_of_peak_sustained_elapsed", "type": "Double", "unit": "", - "value": 99.93449815860195 + "value": 99.96729254087963 }, { "name": "smsp__sass_thread_inst_executed_op_fp16_pred_on.sum", @@ -28418,7 +31020,7 @@ "name": "smsp__sass_thread_inst_executed_op_fp32_pred_on.sum", "type": "Double", "unit": "", - "value": 4848615424.0 + "value": 4462739456.0 }, { "name": "smsp__sass_thread_inst_executed_op_fp64_pred_on.sum", @@ -28430,43 +31032,43 @@ "name": "smsp__sass_thread_inst_executed_op_integer_pred_on.sum", "type": "Double", "unit": "", - "value": 564133888.0 + "value": 5944901632.0 }, { "name": "smsp__sass_thread_inst_executed_op_control_pred_on.sum", "type": "Double", "unit": "", - "value": 69206016.0 + "value": 101187584.0 }, { "name": "smsp__sass_thread_inst_executed_op_memory_pred_on.sum", "type": "Double", "unit": "", - "value": 570425344.0 + "value": 150994944.0 }, { "name": "smsp__sass_thread_inst_executed_op_misc_pred_on.sum", "type": "Double", "unit": "", - "value": 115343360.0 + "value": 1054343168.0 }, { "name": "smsp__sass_thread_inst_executed_op_conversion_pred_on.sum", "type": "Double", "unit": "", - "value": 6291456.0 + "value": 1572864.0 }, { "name": "smsp__inst_executed.sum", "type": "Double", "unit": "", - "value": 193331200.0 + "value": 913571840.0 }, { "name": "smsp__pipe_fma_cycles_active.avg.pct_of_peak_sustained_active", "type": "Double", "unit": "", - "value": 43.61172923594635 + "value": 44.06157599950532 }, { "name": "smsp__inst_executed_pipe_fp64.avg.pct_of_peak_sustained_active", @@ -28478,13 +31080,13 @@ "name": "smsp__inst_executed_pipe_xu.avg.pct_of_peak_sustained_active", "type": "Double", "unit": "", - "value": 85.79773449618698 + "value": 27.877207764709343 }, { "name": "smsp__inst_executed_pipe_lsu.avg.pct_of_peak_sustained_active", "type": "Double", "unit": "", - "value": 11.520691887915731 + "value": 0.9902670238684593 }, { "name": "smsp__inst_executed_pipe_tex.avg.pct_of_peak_sustained_active", @@ -28496,7 +31098,7 @@ "name": "smsp__issue_active.avg.pct_of_peak_sustained_active", "type": "Double", "unit": "", - "value": 61.79398070824909 + "value": 94.98600407344368 }, { "name": "smsp__thread_inst_executed_per_inst_executed.ratio", @@ -28515,30 +31117,47 @@ "time" ], "times": { - "compilation": 26429.369, - "data": 60229.799, - "framework": 197830.326, - "kernel_overhead": 33103.665, - "profiling_overhead": 49654.386, - "profiling_runs": 54842.476, + "compilation_time": 15835.649, + "data": 58642.32, + "framework": 3600016.89, + "kernel_overhead": 1725655.225, + "profiling_overhead": 48519.537, + "profiling_runs": 1767199.808, "runtimes": [ - 2023.871 + 6031.904 ], - "search_algorithm": 31.454, - "validation": 16.638 + "search_algorithm": 76.473, + "validation": 16.496 }, - "timestamp": "2026-01-27 09:26:16 UTC" + "timestamp": "2026-03-13 09:41:2 UTC" }, { + "compilation_data": { + "constant_memory_size": 0, + "global_size": { + "x": 16, + "y": 32, + "z": 256 + }, + "local_memory_size": 0, + "local_size": { + "x": 16, + "y": 8, + "z": 1 + }, + "max_work_group_size": 1024, + "private_memory_size": 0, + "registers": 38 + }, "configuration": { - "INNER_UNROLL_FACTOR": "0", - "USE_CONSTANT_MEMORY": "0", - "USE_SOA": "1", - "VECTOR_SIZE": "1", - "WORK_GROUP_SIZE_X": "32", - "WORK_GROUP_SIZE_Y": "8", - "WORK_GROUP_SIZE_Z": "1", - "Z_ITERATIONS": "8" + "INNER_UNROLL_FACTOR": 0, + "USE_CONSTANT_MEMORY": 0, + "USE_SOA": 1, + "VECTOR_SIZE": 1, + "WORK_GROUP_SIZE_X": 16, + "WORK_GROUP_SIZE_Y": 8, + "WORK_GROUP_SIZE_Z": 1, + "Z_ITERATIONS": 1 }, "correctness": 1, "invalidity": "correct", @@ -28546,61 +31165,61 @@ { "name": "time", "unit": "", - "value": 1929.536 + "value": 6998.944 }, { "name": "dram__throughput.avg.pct_of_peak_sustained_elapsed", "type": "Double", "unit": "", - "value": 53.96121589155444 + "value": 7.104357300575638 }, { "name": "dram__sectors_read.sum", "type": "Double", "unit": "", - "value": 2098852.0 + "value": 14648.0 }, { "name": "dram__sectors_write.sum", "type": "Double", "unit": "", - "value": 1839816.0 + "value": 1871560.0 }, { "name": "lts__t_sectors.avg.pct_of_peak_sustained_elapsed", "type": "Double", "unit": "", - "value": 6.3646714392350034 + "value": 0.9286597821081216 }, { "name": "lts__t_sectors_op_read.sum", "type": "Double", "unit": "", - "value": 2131155.0 + "value": 129798.0 }, { "name": "lts__t_sectors_op_write.sum", "type": "Double", "unit": "", - "value": 2102992.0 + "value": 2102280.0 }, { "name": "l1tex__t_requests_pipe_lsu_mem_global_op_ld.avg.pct_of_peak_sustained_active", "type": "Double", "unit": "", - "value": 22.388302530979452 + "value": 48.012876110896066 }, { "name": "l1tex__t_requests_pipe_lsu_mem_global_op_ld.sum", "type": "Double", "unit": "", - "value": 17301504.0 + "value": 134217728.0 }, { "name": "l1tex__data_pipe_lsu_wavefronts_mem_shared.avg.pct_of_peak_sustained_elapsed", "type": "Double", "unit": "", - "value": 0.2542666957186231 + "value": 0.562476783757018 }, { "name": "l1tex__data_pipe_lsu_wavefronts_mem_shared_op_ld.sum", @@ -28630,13 +31249,13 @@ "name": "sm__warps_active.avg.pct_of_peak_sustained_active", "type": "Double", "unit": "", - "value": 97.14930458832873 + "value": 80.02497749666871 }, { "name": "smsp__cycles_active.avg.pct_of_peak_sustained_elapsed", "type": "Double", "unit": "", - "value": 99.91003935722753 + "value": 99.94021370415773 }, { "name": "smsp__sass_thread_inst_executed_op_fp16_pred_on.sum", @@ -28648,7 +31267,7 @@ "name": "smsp__sass_thread_inst_executed_op_fp32_pred_on.sum", "type": "Double", "unit": "", - "value": 4848615424.0 + "value": 8589934592.0 }, { "name": "smsp__sass_thread_inst_executed_op_fp64_pred_on.sum", @@ -28660,43 +31279,43 @@ "name": "smsp__sass_thread_inst_executed_op_integer_pred_on.sum", "type": "Double", "unit": "", - "value": 564133888.0 + "value": 2952790016.0 }, { "name": "smsp__sass_thread_inst_executed_op_control_pred_on.sum", "type": "Double", "unit": "", - "value": 69206016.0 + "value": 285212672.0 }, { "name": "smsp__sass_thread_inst_executed_op_memory_pred_on.sum", "type": "Double", "unit": "", - "value": 570425344.0 + "value": 4311744512.0 }, { "name": "smsp__sass_thread_inst_executed_op_misc_pred_on.sum", "type": "Double", "unit": "", - "value": 115343360.0 + "value": 318767104.0 }, { "name": "smsp__sass_thread_inst_executed_op_conversion_pred_on.sum", "type": "Double", "unit": "", - "value": 6291456.0 + "value": 50331648.0 }, { "name": "smsp__inst_executed.sum", "type": "Double", "unit": "", - "value": 193331200.0 + "value": 527958016.0 }, { "name": "smsp__pipe_fma_cycles_active.avg.pct_of_peak_sustained_active", "type": "Double", "unit": "", - "value": 44.155501564302995 + "value": 21.387433625700748 }, { "name": "smsp__inst_executed_pipe_fp64.avg.pct_of_peak_sustained_active", @@ -28708,13 +31327,13 @@ "name": "smsp__inst_executed_pipe_xu.avg.pct_of_peak_sustained_active", "type": "Double", "unit": "", - "value": 86.86784567762422 + "value": 24.01336614242304 }, { "name": "smsp__inst_executed_pipe_lsu.avg.pct_of_peak_sustained_active", "type": "Double", "unit": "", - "value": 11.664383574876297 + "value": 24.3885749883984 }, { "name": "smsp__inst_executed_pipe_tex.avg.pct_of_peak_sustained_active", @@ -28726,7 +31345,7 @@ "name": "smsp__issue_active.avg.pct_of_peak_sustained_active", "type": "Double", "unit": "", - "value": 62.5647053677021 + "value": 47.229819978665525 }, { "name": "smsp__thread_inst_executed_per_inst_executed.ratio", @@ -28745,30 +31364,47 @@ "time" ], "times": { - "compilation": 25970.017, - "data": 60772.455, - "framework": 201157.421, - "kernel_overhead": 34227.313, - "profiling_overhead": 50280.887, - "profiling_runs": 55876.766, + "compilation_time": 14187.319, + "data": 59699.046, + "framework": 508043.09199999995, + "kernel_overhead": 175445.973, + "profiling_overhead": 49572.419, + "profiling_runs": 223325.654, "runtimes": [ - 1929.536 + 6998.944 ], - "search_algorithm": 37.294, - "validation": 21.727 + "search_algorithm": 34.594, + "validation": 17.037 }, - "timestamp": "2026-01-27 09:26:16 UTC" + "timestamp": "2026-03-13 09:41:2 UTC" }, { + "compilation_data": { + "constant_memory_size": 0, + "global_size": { + "x": 16, + "y": 32, + "z": 128 + }, + "local_memory_size": 0, + "local_size": { + "x": 16, + "y": 8, + "z": 1 + }, + "max_work_group_size": 1024, + "private_memory_size": 0, + "registers": 39 + }, "configuration": { - "INNER_UNROLL_FACTOR": "0", - "USE_CONSTANT_MEMORY": "0", - "USE_SOA": "1", - "VECTOR_SIZE": "1", - "WORK_GROUP_SIZE_X": "16", - "WORK_GROUP_SIZE_Y": "4", - "WORK_GROUP_SIZE_Z": "1", - "Z_ITERATIONS": "16" + "INNER_UNROLL_FACTOR": 0, + "USE_CONSTANT_MEMORY": 0, + "USE_SOA": 1, + "VECTOR_SIZE": 1, + "WORK_GROUP_SIZE_X": 16, + "WORK_GROUP_SIZE_Y": 8, + "WORK_GROUP_SIZE_Z": 1, + "Z_ITERATIONS": 2 }, "correctness": 1, "invalidity": "correct", @@ -28776,61 +31412,61 @@ { "name": "time", "unit": "", - "value": 1850.304 + "value": 3703.584 }, { "name": "dram__throughput.avg.pct_of_peak_sustained_elapsed", "type": "Double", "unit": "", - "value": 56.70493259579457 + "value": 13.380200914867974 }, { "name": "dram__sectors_read.sum", "type": "Double", "unit": "", - "value": 2098060.0 + "value": 5572.0 }, { "name": "dram__sectors_write.sum", "type": "Double", "unit": "", - "value": 1864644.0 + "value": 1870208.0 }, { "name": "lts__t_sectors.avg.pct_of_peak_sustained_elapsed", "type": "Double", "unit": "", - "value": 6.621156553024347 + "value": 1.7099487647300613 }, { "name": "lts__t_sectors_op_read.sum", "type": "Double", "unit": "", - "value": 2117161.0 + "value": 68363.0 }, { "name": "lts__t_sectors_op_write.sum", "type": "Double", "unit": "", - "value": 2100312.0 + "value": 2101230.0 }, { "name": "l1tex__t_requests_pipe_lsu_mem_global_op_ld.avg.pct_of_peak_sustained_active", "type": "Double", "unit": "", - "value": 12.047590497715385 + "value": 45.733652647907455 }, { "name": "l1tex__t_requests_pipe_lsu_mem_global_op_ld.sum", "type": "Double", "unit": "", - "value": 8912896.0 + "value": 67108864.0 }, { "name": "l1tex__data_pipe_lsu_wavefronts_mem_shared.avg.pct_of_peak_sustained_elapsed", "type": "Double", "unit": "", - "value": 0.1327849614115318 + "value": 0.5357640371006132 }, { "name": "l1tex__data_pipe_lsu_wavefronts_mem_shared_op_ld.sum", @@ -28860,13 +31496,13 @@ "name": "sm__warps_active.avg.pct_of_peak_sustained_active", "type": "Double", "unit": "", - "value": 97.92352878534302 + "value": 92.89026769659606 }, { "name": "smsp__cycles_active.avg.pct_of_peak_sustained_elapsed", "type": "Double", "unit": "", - "value": 99.91938234407385 + "value": 99.94642628726379 }, { "name": "smsp__sass_thread_inst_executed_op_fp16_pred_on.sum", @@ -28878,7 +31514,7 @@ "name": "smsp__sass_thread_inst_executed_op_fp32_pred_on.sum", "type": "Double", "unit": "", - "value": 4580179968.0 + "value": 6442450944.0 }, { "name": "smsp__sass_thread_inst_executed_op_fp64_pred_on.sum", @@ -28890,43 +31526,43 @@ "name": "smsp__sass_thread_inst_executed_op_integer_pred_on.sum", "type": "Double", "unit": "", - "value": 209715200.0 + "value": 1509949440.0 }, { "name": "smsp__sass_thread_inst_executed_op_control_pred_on.sum", "type": "Double", "unit": "", - "value": 67108864.0 + "value": 142606336.0 }, { "name": "smsp__sass_thread_inst_executed_op_memory_pred_on.sum", "type": "Double", "unit": "", - "value": 301989888.0 + "value": 2164260864.0 }, { "name": "smsp__sass_thread_inst_executed_op_misc_pred_on.sum", "type": "Double", "unit": "", - "value": 558891008.0 + "value": 184549376.0 }, { "name": "smsp__sass_thread_inst_executed_op_conversion_pred_on.sum", "type": "Double", "unit": "", - "value": 3145728.0 + "value": 25165824.0 }, { "name": "smsp__inst_executed.sum", "type": "Double", "unit": "", - "value": 195854336.0 + "value": 333971456.0 }, { "name": "smsp__pipe_fma_cycles_active.avg.pct_of_peak_sustained_active", "type": "Double", "unit": "", - "value": 37.58570409871653 + "value": 28.991046906449906 }, { "name": "smsp__inst_executed_pipe_fp64.avg.pct_of_peak_sustained_active", @@ -28938,13 +31574,13 @@ "name": "smsp__inst_executed_pipe_xu.avg.pct_of_peak_sustained_active", "type": "Double", "unit": "", - "value": 90.72100413724048 + "value": 45.74303740937751 }, { "name": "smsp__inst_executed_pipe_lsu.avg.pct_of_peak_sustained_active", "type": "Double", "unit": "", - "value": 6.445266651351803 + "value": 23.318228054389706 }, { "name": "smsp__inst_executed_pipe_tex.avg.pct_of_peak_sustained_active", @@ -28956,7 +31592,7 @@ "name": "smsp__issue_active.avg.pct_of_peak_sustained_active", "type": "Double", "unit": "", - "value": 66.19269265360039 + "value": 56.91137268541131 }, { "name": "smsp__thread_inst_executed_per_inst_executed.ratio", @@ -28975,30 +31611,47 @@ "time" ], "times": { - "compilation": 27272.074, - "data": 59560.217, - "framework": 203118.783, - "kernel_overhead": 36352.0, - "profiling_overhead": 49592.773, - "profiling_runs": 57613.793, + "compilation_time": 14637.224, + "data": 57699.907, + "framework": 320098.083, + "kernel_overhead": 91968.069, + "profiling_overhead": 47623.558, + "profiling_runs": 122806.549, "runtimes": [ - 1850.304 + 3703.584 ], - "search_algorithm": 35.902, - "validation": 20.3 + "search_algorithm": 24.719, + "validation": 15.862 }, - "timestamp": "2026-01-27 09:26:16 UTC" + "timestamp": "2026-03-13 09:41:2 UTC" }, { + "compilation_data": { + "constant_memory_size": 0, + "global_size": { + "x": 16, + "y": 32, + "z": 64 + }, + "local_memory_size": 0, + "local_size": { + "x": 16, + "y": 8, + "z": 1 + }, + "max_work_group_size": 1024, + "private_memory_size": 0, + "registers": 40 + }, "configuration": { - "INNER_UNROLL_FACTOR": "0", - "USE_CONSTANT_MEMORY": "0", - "USE_SOA": "1", - "VECTOR_SIZE": "1", - "WORK_GROUP_SIZE_X": "16", - "WORK_GROUP_SIZE_Y": "8", - "WORK_GROUP_SIZE_Z": "1", - "Z_ITERATIONS": "16" + "INNER_UNROLL_FACTOR": 0, + "USE_CONSTANT_MEMORY": 0, + "USE_SOA": 1, + "VECTOR_SIZE": 1, + "WORK_GROUP_SIZE_X": 16, + "WORK_GROUP_SIZE_Y": 8, + "WORK_GROUP_SIZE_Z": 1, + "Z_ITERATIONS": 4 }, "correctness": 1, "invalidity": "correct", @@ -29006,61 +31659,61 @@ { "name": "time", "unit": "", - "value": 1860.256 + "value": 2230.272 }, { "name": "dram__throughput.avg.pct_of_peak_sustained_elapsed", "type": "Double", "unit": "", - "value": 56.38355624954477 + "value": 22.278007856512076 }, { "name": "dram__sectors_read.sum", "type": "Double", "unit": "", - "value": 2097352.0 + "value": 2400.0 }, { "name": "dram__sectors_write.sum", "type": "Double", "unit": "", - "value": 1865988.0 + "value": 1870464.0 }, { "name": "lts__t_sectors.avg.pct_of_peak_sustained_elapsed", "type": "Double", "unit": "", - "value": 6.579030095773924 + "value": 2.813576136040202 }, { "name": "lts__t_sectors_op_read.sum", "type": "Double", "unit": "", - "value": 2115651.0 + "value": 41646.0 }, { "name": "lts__t_sectors_op_write.sum", "type": "Double", "unit": "", - "value": 2098982.0 + "value": 2099625.0 }, { "name": "l1tex__t_requests_pipe_lsu_mem_global_op_ld.avg.pct_of_peak_sustained_active", "type": "Double", "unit": "", - "value": 12.0492683880869 + "value": 37.92384313564955 }, { "name": "l1tex__t_requests_pipe_lsu_mem_global_op_ld.sum", "type": "Double", "unit": "", - "value": 8912896.0 + "value": 33554432.0 }, { "name": "l1tex__data_pipe_lsu_wavefronts_mem_shared.avg.pct_of_peak_sustained_elapsed", "type": "Double", "unit": "", - "value": 0.132781589521406 + "value": 0.44409889765856914 }, { "name": "l1tex__data_pipe_lsu_wavefronts_mem_shared_op_ld.sum", @@ -29090,13 +31743,13 @@ "name": "sm__warps_active.avg.pct_of_peak_sustained_active", "type": "Double", "unit": "", - "value": 97.47793041624257 + "value": 94.67073182844801 }, { "name": "smsp__cycles_active.avg.pct_of_peak_sustained_elapsed", "type": "Double", "unit": "", - "value": 99.85297434866729 + "value": 99.88996858455674 }, { "name": "smsp__sass_thread_inst_executed_op_fp16_pred_on.sum", @@ -29108,7 +31761,7 @@ "name": "smsp__sass_thread_inst_executed_op_fp32_pred_on.sum", "type": "Double", "unit": "", - "value": 4580179968.0 + "value": 5368709120.0 }, { "name": "smsp__sass_thread_inst_executed_op_fp64_pred_on.sum", @@ -29120,43 +31773,43 @@ "name": "smsp__sass_thread_inst_executed_op_integer_pred_on.sum", "type": "Double", "unit": "", - "value": 209715200.0 + "value": 591396864.0 }, { "name": "smsp__sass_thread_inst_executed_op_control_pred_on.sum", "type": "Double", "unit": "", - "value": 67108864.0 + "value": 71303168.0 }, { "name": "smsp__sass_thread_inst_executed_op_memory_pred_on.sum", "type": "Double", "unit": "", - "value": 301989888.0 + "value": 1090519040.0 }, { "name": "smsp__sass_thread_inst_executed_op_misc_pred_on.sum", "type": "Double", "unit": "", - "value": 558891008.0 + "value": 146800640.0 }, { "name": "smsp__sass_thread_inst_executed_op_conversion_pred_on.sum", "type": "Double", "unit": "", - "value": 3145728.0 + "value": 12582912.0 }, { "name": "smsp__inst_executed.sum", "type": "Double", "unit": "", - "value": 195854336.0 + "value": 228327424.0 }, { "name": "smsp__pipe_fma_cycles_active.avg.pct_of_peak_sustained_active", "type": "Double", "unit": "", - "value": 37.610881970169146 + "value": 43.23574401996685 }, { "name": "smsp__inst_executed_pipe_fp64.avg.pct_of_peak_sustained_active", @@ -29168,13 +31821,13 @@ "name": "smsp__inst_executed_pipe_xu.avg.pct_of_peak_sustained_active", "type": "Double", "unit": "", - "value": 90.77903357867241 + "value": 75.87636637363696 }, { "name": "smsp__inst_executed_pipe_lsu.avg.pct_of_peak_sustained_active", "type": "Double", "unit": "", - "value": 6.449389348484782 + "value": 19.487777691666526 }, { "name": "smsp__inst_executed_pipe_tex.avg.pct_of_peak_sustained_active", @@ -29186,7 +31839,7 @@ "name": "smsp__issue_active.avg.pct_of_peak_sustained_active", "type": "Double", "unit": "", - "value": 66.23509485267606 + "value": 64.54044886226517 }, { "name": "smsp__thread_inst_executed_per_inst_executed.ratio", @@ -29205,30 +31858,47 @@ "time" ], "times": { - "compilation": 27141.27, - "data": 59344.323, - "framework": 203791.869, - "kernel_overhead": 36962.238, - "profiling_overhead": 49314.885, - "profiling_runs": 58170.423, + "compilation_time": 15150.569, + "data": 56475.902, + "framework": 190581.41199999998, + "kernel_overhead": 32275.433, + "profiling_overhead": 46833.468, + "profiling_runs": 54996.609, "runtimes": [ - 1860.256 + 2230.272 ], - "search_algorithm": 34.278, - "validation": 18.574 + "search_algorithm": 24.929, + "validation": 16.73 }, - "timestamp": "2026-01-27 09:26:16 UTC" + "timestamp": "2026-03-13 09:41:3 UTC" }, { + "compilation_data": { + "constant_memory_size": 0, + "global_size": { + "x": 16, + "y": 32, + "z": 32 + }, + "local_memory_size": 0, + "local_size": { + "x": 16, + "y": 8, + "z": 1 + }, + "max_work_group_size": 1024, + "private_memory_size": 0, + "registers": 40 + }, "configuration": { - "INNER_UNROLL_FACTOR": "0", - "USE_CONSTANT_MEMORY": "0", - "USE_SOA": "1", - "VECTOR_SIZE": "1", - "WORK_GROUP_SIZE_X": "32", - "WORK_GROUP_SIZE_Y": "2", - "WORK_GROUP_SIZE_Z": "1", - "Z_ITERATIONS": "16" + "INNER_UNROLL_FACTOR": 0, + "USE_CONSTANT_MEMORY": 0, + "USE_SOA": 1, + "VECTOR_SIZE": 1, + "WORK_GROUP_SIZE_X": 16, + "WORK_GROUP_SIZE_Y": 8, + "WORK_GROUP_SIZE_Z": 1, + "Z_ITERATIONS": 8 }, "correctness": 1, "invalidity": "correct", @@ -29236,61 +31906,61 @@ { "name": "time", "unit": "", - "value": 1862.848 + "value": 2183.776 }, { "name": "dram__throughput.avg.pct_of_peak_sustained_elapsed", "type": "Double", "unit": "", - "value": 55.84642876568279 + "value": 24.798216878862586 }, { "name": "dram__sectors_read.sum", "type": "Double", "unit": "", - "value": 2103644.0 + "value": 7564.0 }, { "name": "dram__sectors_write.sum", "type": "Double", "unit": "", - "value": 1839096.0 + "value": 1872244.0 }, { "name": "lts__t_sectors.avg.pct_of_peak_sustained_elapsed", "type": "Double", "unit": "", - "value": 6.628668806407304 + "value": 3.149764851502613 }, { "name": "lts__t_sectors_op_read.sum", "type": "Double", "unit": "", - "value": 2135125.0 + "value": 41956.0 }, { "name": "lts__t_sectors_op_write.sum", "type": "Double", "unit": "", - "value": 2100518.0 + "value": 2100609.0 }, { "name": "l1tex__t_requests_pipe_lsu_mem_global_op_ld.avg.pct_of_peak_sustained_active", "type": "Double", "unit": "", - "value": 12.066148826588194 + "value": 21.53424425723229 }, { "name": "l1tex__t_requests_pipe_lsu_mem_global_op_ld.sum", "type": "Double", "unit": "", - "value": 8912896.0 + "value": 16777216.0 }, { "name": "l1tex__data_pipe_lsu_wavefronts_mem_shared.avg.pct_of_peak_sustained_elapsed", "type": "Double", "unit": "", - "value": 0.13298145770100944 + "value": 0.2521515630349666 }, { "name": "l1tex__data_pipe_lsu_wavefronts_mem_shared_op_ld.sum", @@ -29320,13 +31990,13 @@ "name": "sm__warps_active.avg.pct_of_peak_sustained_active", "type": "Double", "unit": "", - "value": 98.14210093587774 + "value": 97.88676374506385 }, { "name": "smsp__cycles_active.avg.pct_of_peak_sustained_elapsed", "type": "Double", "unit": "", - "value": 99.90399244414682 + "value": 99.91062963094825 }, { "name": "smsp__sass_thread_inst_executed_op_fp16_pred_on.sum", @@ -29338,7 +32008,7 @@ "name": "smsp__sass_thread_inst_executed_op_fp32_pred_on.sum", "type": "Double", "unit": "", - "value": 4580179968.0 + "value": 4831838208.0 }, { "name": "smsp__sass_thread_inst_executed_op_fp64_pred_on.sum", @@ -29350,43 +32020,43 @@ "name": "smsp__sass_thread_inst_executed_op_integer_pred_on.sum", "type": "Double", "unit": "", - "value": 209715200.0 + "value": 564133888.0 }, { "name": "smsp__sass_thread_inst_executed_op_control_pred_on.sum", "type": "Double", "unit": "", - "value": 67108864.0 + "value": 69206016.0 }, { "name": "smsp__sass_thread_inst_executed_op_memory_pred_on.sum", "type": "Double", "unit": "", - "value": 301989888.0 + "value": 553648128.0 }, { "name": "smsp__sass_thread_inst_executed_op_misc_pred_on.sum", "type": "Double", "unit": "", - "value": 558891008.0 + "value": 115343360.0 }, { "name": "smsp__sass_thread_inst_executed_op_conversion_pred_on.sum", "type": "Double", "unit": "", - "value": 3145728.0 + "value": 6291456.0 }, { "name": "smsp__inst_executed.sum", "type": "Double", "unit": "", - "value": 195854336.0 + "value": 192282624.0 }, { "name": "smsp__pipe_fma_cycles_active.avg.pct_of_peak_sustained_active", "type": "Double", "unit": "", - "value": 37.647643915688064 + "value": 43.61975861920062 }, { "name": "smsp__inst_executed_pipe_fp64.avg.pct_of_peak_sustained_active", @@ -29398,13 +32068,13 @@ "name": "smsp__inst_executed_pipe_xu.avg.pct_of_peak_sustained_active", "type": "Double", "unit": "", - "value": 90.86924980298042 + "value": 86.14472137134345 }, { "name": "smsp__inst_executed_pipe_lsu.avg.pct_of_peak_sustained_active", "type": "Double", "unit": "", - "value": 6.455798753092604 + "value": 11.230781545971046 }, { "name": "smsp__inst_executed_pipe_tex.avg.pct_of_peak_sustained_active", @@ -29416,7 +32086,7 @@ "name": "smsp__issue_active.avg.pct_of_peak_sustained_active", "type": "Double", "unit": "", - "value": 66.30085837249112 + "value": 61.70738775567418 }, { "name": "smsp__thread_inst_executed_per_inst_executed.ratio", @@ -29435,30 +32105,47 @@ "time" ], "times": { - "compilation": 27034.197, - "data": 59473.151, - "framework": 204077.20299999998, - "kernel_overhead": 36884.17, - "profiling_overhead": 49599.195, - "profiling_runs": 58120.687, + "compilation_time": 14990.453, + "data": 57170.313, + "framework": 186827.064, + "kernel_overhead": 30286.394, + "profiling_overhead": 47208.536, + "profiling_runs": 52161.821, "runtimes": [ - 1862.848 + 2183.776 ], - "search_algorithm": 46.506, - "validation": 19.081 + "search_algorithm": 38.819, + "validation": 17.218 }, - "timestamp": "2026-01-27 09:26:17 UTC" + "timestamp": "2026-03-13 09:41:3 UTC" }, { + "compilation_data": { + "constant_memory_size": 0, + "global_size": { + "x": 16, + "y": 32, + "z": 16 + }, + "local_memory_size": 0, + "local_size": { + "x": 16, + "y": 8, + "z": 1 + }, + "max_work_group_size": 1024, + "private_memory_size": 0, + "registers": 40 + }, "configuration": { - "INNER_UNROLL_FACTOR": "0", - "USE_CONSTANT_MEMORY": "0", - "USE_SOA": "1", - "VECTOR_SIZE": "1", - "WORK_GROUP_SIZE_X": "32", - "WORK_GROUP_SIZE_Y": "4", - "WORK_GROUP_SIZE_Z": "1", - "Z_ITERATIONS": "16" + "INNER_UNROLL_FACTOR": 0, + "USE_CONSTANT_MEMORY": 0, + "USE_SOA": 1, + "VECTOR_SIZE": 1, + "WORK_GROUP_SIZE_X": 16, + "WORK_GROUP_SIZE_Y": 8, + "WORK_GROUP_SIZE_Z": 1, + "Z_ITERATIONS": 16 }, "correctness": 1, "invalidity": "correct", @@ -29466,61 +32153,61 @@ { "name": "time", "unit": "", - "value": 1914.08 + "value": 1925.696 }, { "name": "dram__throughput.avg.pct_of_peak_sustained_elapsed", "type": "Double", "unit": "", - "value": 56.176507480386796 + "value": 26.28391298883183 }, { "name": "dram__sectors_read.sum", "type": "Double", "unit": "", - "value": 2101348.0 + "value": 3048.0 }, { "name": "dram__sectors_write.sum", "type": "Double", "unit": "", - "value": 1839816.0 + "value": 1870688.0 }, { "name": "lts__t_sectors.avg.pct_of_peak_sustained_elapsed", "type": "Double", "unit": "", - "value": 6.643576139243586 + "value": 3.3730135376728914 }, { "name": "lts__t_sectors_op_read.sum", "type": "Double", "unit": "", - "value": 2134610.0 + "value": 32367.0 }, { "name": "lts__t_sectors_op_write.sum", "type": "Double", "unit": "", - "value": 2103604.0 + "value": 2099464.0 }, { "name": "l1tex__t_requests_pipe_lsu_mem_global_op_ld.avg.pct_of_peak_sustained_active", "type": "Double", "unit": "", - "value": 12.066370006225496 + "value": 11.449363319319117 }, { "name": "l1tex__t_requests_pipe_lsu_mem_global_op_ld.sum", "type": "Double", "unit": "", - "value": 8912896.0 + "value": 8388608.0 }, { "name": "l1tex__data_pipe_lsu_wavefronts_mem_shared.avg.pct_of_peak_sustained_elapsed", "type": "Double", "unit": "", - "value": 0.1329963831867375 + "value": 0.13406558805402294 }, { "name": "l1tex__data_pipe_lsu_wavefronts_mem_shared_op_ld.sum", @@ -29550,13 +32237,13 @@ "name": "sm__warps_active.avg.pct_of_peak_sustained_active", "type": "Double", "unit": "", - "value": 98.0469428685529 + "value": 98.13291401373382 }, { "name": "smsp__cycles_active.avg.pct_of_peak_sustained_elapsed", "type": "Double", "unit": "", - "value": 99.92014284502079 + "value": 99.89284088068698 }, { "name": "smsp__sass_thread_inst_executed_op_fp16_pred_on.sum", @@ -29568,7 +32255,7 @@ "name": "smsp__sass_thread_inst_executed_op_fp32_pred_on.sum", "type": "Double", "unit": "", - "value": 4580179968.0 + "value": 4563402752.0 }, { "name": "smsp__sass_thread_inst_executed_op_fp64_pred_on.sum", @@ -29592,7 +32279,7 @@ "name": "smsp__sass_thread_inst_executed_op_memory_pred_on.sum", "type": "Double", "unit": "", - "value": 301989888.0 + "value": 285212672.0 }, { "name": "smsp__sass_thread_inst_executed_op_misc_pred_on.sum", @@ -29610,13 +32297,13 @@ "name": "smsp__inst_executed.sum", "type": "Double", "unit": "", - "value": 195854336.0 + "value": 194805760.0 }, { "name": "smsp__pipe_fma_cycles_active.avg.pct_of_peak_sustained_active", "type": "Double", "unit": "", - "value": 37.645817787760194 + "value": 37.78015555554653 }, { "name": "smsp__inst_executed_pipe_fp64.avg.pct_of_peak_sustained_active", @@ -29628,13 +32315,13 @@ "name": "smsp__inst_executed_pipe_xu.avg.pct_of_peak_sustained_active", "type": "Double", "unit": "", - "value": 90.86475960070864 + "value": 91.62028760485575 }, { "name": "smsp__inst_executed_pipe_lsu.avg.pct_of_peak_sustained_active", "type": "Double", "unit": "", - "value": 6.455479747023 + "value": 6.15126442659554 }, { "name": "smsp__inst_executed_pipe_tex.avg.pct_of_peak_sustained_active", @@ -29646,7 +32333,7 @@ "name": "smsp__issue_active.avg.pct_of_peak_sustained_active", "type": "Double", "unit": "", - "value": 66.29764311935253 + "value": 66.49136896443741 }, { "name": "smsp__thread_inst_executed_per_inst_executed.ratio", @@ -29665,30 +32352,47 @@ "time" ], "times": { - "compilation": 27235.084, - "data": 60066.886, - "framework": 204979.579, - "kernel_overhead": 36818.946, - "profiling_overhead": 49976.633, - "profiling_runs": 58117.114, + "compilation_time": 15482.669, + "data": 56967.527, + "framework": 192678.343, + "kernel_overhead": 33704.686, + "profiling_overhead": 47266.098, + "profiling_runs": 54740.032, "runtimes": [ - 1914.08 + 1925.696 ], - "search_algorithm": 44.085, - "validation": 19.047 + "search_algorithm": 33.7, + "validation": 14.178 }, - "timestamp": "2026-01-27 09:26:17 UTC" + "timestamp": "2026-03-13 09:41:3 UTC" }, { + "compilation_data": { + "constant_memory_size": 0, + "global_size": { + "x": 16, + "y": 32, + "z": 8 + }, + "local_memory_size": 0, + "local_size": { + "x": 16, + "y": 8, + "z": 1 + }, + "max_work_group_size": 1024, + "private_memory_size": 0, + "registers": 48 + }, "configuration": { - "INNER_UNROLL_FACTOR": "0", - "USE_CONSTANT_MEMORY": "0", - "USE_SOA": "1", - "VECTOR_SIZE": "1", - "WORK_GROUP_SIZE_X": "32", - "WORK_GROUP_SIZE_Y": "8", - "WORK_GROUP_SIZE_Z": "1", - "Z_ITERATIONS": "16" + "INNER_UNROLL_FACTOR": 0, + "USE_CONSTANT_MEMORY": 0, + "USE_SOA": 1, + "VECTOR_SIZE": 1, + "WORK_GROUP_SIZE_X": 16, + "WORK_GROUP_SIZE_Y": 8, + "WORK_GROUP_SIZE_Z": 1, + "Z_ITERATIONS": 32 }, "correctness": 1, "invalidity": "correct", @@ -29696,61 +32400,61 @@ { "name": "time", "unit": "", - "value": 1808.512 + "value": 1810.016 }, { "name": "dram__throughput.avg.pct_of_peak_sustained_elapsed", "type": "Double", "unit": "", - "value": 56.98366998221827 + "value": 27.68950941038361 }, { "name": "dram__sectors_read.sum", "type": "Double", "unit": "", - "value": 2098652.0 + "value": 2440.0 }, { "name": "dram__sectors_write.sum", "type": "Double", "unit": "", - "value": 1839184.0 + "value": 1869424.0 }, { "name": "lts__t_sectors.avg.pct_of_peak_sustained_elapsed", "type": "Double", "unit": "", - "value": 6.779175735349016 + "value": 3.496568884445208 }, { "name": "lts__t_sectors_op_read.sum", "type": "Double", "unit": "", - "value": 2127989.0 + "value": 32650.0 }, { "name": "lts__t_sectors_op_write.sum", "type": "Double", "unit": "", - "value": 2099302.0 + "value": 2103205.0 }, { "name": "l1tex__t_requests_pipe_lsu_mem_global_op_ld.avg.pct_of_peak_sustained_active", "type": "Double", "unit": "", - "value": 12.399056169437102 + "value": 5.930355581196915 }, { "name": "l1tex__t_requests_pipe_lsu_mem_global_op_ld.sum", "type": "Double", "unit": "", - "value": 8912896.0 + "value": 4194304.0 }, { "name": "l1tex__data_pipe_lsu_wavefronts_mem_shared.avg.pct_of_peak_sustained_elapsed", "type": "Double", "unit": "", - "value": 0.13666947131225715 + "value": 0.06944478980970048 }, { "name": "l1tex__data_pipe_lsu_wavefronts_mem_shared_op_ld.sum", @@ -29780,13 +32484,13 @@ "name": "sm__warps_active.avg.pct_of_peak_sustained_active", "type": "Double", "unit": "", - "value": 97.96302412145143 + "value": 81.64995754438033 }, { "name": "smsp__cycles_active.avg.pct_of_peak_sustained_elapsed", "type": "Double", "unit": "", - "value": 99.92767865434847 + "value": 99.9127164667793 }, { "name": "smsp__sass_thread_inst_executed_op_fp16_pred_on.sum", @@ -29798,7 +32502,7 @@ "name": "smsp__sass_thread_inst_executed_op_fp32_pred_on.sum", "type": "Double", "unit": "", - "value": 4580179968.0 + "value": 4429185024.0 }, { "name": "smsp__sass_thread_inst_executed_op_fp64_pred_on.sum", @@ -29810,43 +32514,43 @@ "name": "smsp__sass_thread_inst_executed_op_integer_pred_on.sum", "type": "Double", "unit": "", - "value": 209715200.0 + "value": 138412032.0 }, { "name": "smsp__sass_thread_inst_executed_op_control_pred_on.sum", "type": "Double", "unit": "", - "value": 67108864.0 + "value": 33554432.0 }, { "name": "smsp__sass_thread_inst_executed_op_memory_pred_on.sum", "type": "Double", "unit": "", - "value": 301989888.0 + "value": 150994944.0 }, { "name": "smsp__sass_thread_inst_executed_op_misc_pred_on.sum", "type": "Double", "unit": "", - "value": 558891008.0 + "value": 283639808.0 }, { "name": "smsp__sass_thread_inst_executed_op_conversion_pred_on.sum", "type": "Double", "unit": "", - "value": 3145728.0 + "value": 1572864.0 }, { "name": "smsp__inst_executed.sum", "type": "Double", "unit": "", - "value": 195854336.0 + "value": 165953536.0 }, { "name": "smsp__pipe_fma_cycles_active.avg.pct_of_peak_sustained_active", "type": "Double", "unit": "", - "value": 38.68290873312754 + "value": 37.63968897043487 }, { "name": "smsp__inst_executed_pipe_fp64.avg.pct_of_peak_sustained_active", @@ -29858,13 +32562,13 @@ "name": "smsp__inst_executed_pipe_xu.avg.pct_of_peak_sustained_active", "type": "Double", "unit": "", - "value": 93.36721684345278 + "value": 94.89811678279898 }, { "name": "smsp__inst_executed_pipe_lsu.avg.pct_of_peak_sustained_active", "type": "Double", "unit": "", - "value": 6.633266626329287 + "value": 3.3710146464592903 }, { "name": "smsp__inst_executed_pipe_tex.avg.pct_of_peak_sustained_active", @@ -29876,7 +32580,7 @@ "name": "smsp__issue_active.avg.pct_of_peak_sustained_active", "type": "Double", "unit": "", - "value": 68.12351763904022 + "value": 58.669701308388746 }, { "name": "smsp__thread_inst_executed_per_inst_executed.ratio", @@ -29895,30 +32599,47 @@ "time" ], "times": { - "compilation": 27091.516, - "data": 60290.713, - "framework": 204693.569, - "kernel_overhead": 36979.389, - "profiling_overhead": 49614.866, - "profiling_runs": 57808.601, + "compilation_time": 15483.959, + "data": 58135.221, + "framework": 168817.28900000002, + "kernel_overhead": 20869.132, + "profiling_overhead": 48296.924, + "profiling_runs": 41516.012, "runtimes": [ - 1808.512 + 1810.016 ], - "search_algorithm": 33.352, - "validation": 19.011 + "search_algorithm": 30.763, + "validation": 16.044 }, - "timestamp": "2026-01-27 09:26:17 UTC" + "timestamp": "2026-03-13 09:41:3 UTC" }, { + "compilation_data": { + "constant_memory_size": 0, + "global_size": { + "x": 16, + "y": 32, + "z": 128 + }, + "local_memory_size": 0, + "local_size": { + "x": 16, + "y": 8, + "z": 1 + }, + "max_work_group_size": 1024, + "private_memory_size": 0, + "registers": 22 + }, "configuration": { - "INNER_UNROLL_FACTOR": "0", - "USE_CONSTANT_MEMORY": "0", - "USE_SOA": "1", - "VECTOR_SIZE": "1", - "WORK_GROUP_SIZE_X": "16", - "WORK_GROUP_SIZE_Y": "4", - "WORK_GROUP_SIZE_Z": "1", - "Z_ITERATIONS": "32" + "INNER_UNROLL_FACTOR": 1, + "USE_CONSTANT_MEMORY": 0, + "USE_SOA": 1, + "VECTOR_SIZE": 1, + "WORK_GROUP_SIZE_X": 16, + "WORK_GROUP_SIZE_Y": 8, + "WORK_GROUP_SIZE_Z": 1, + "Z_ITERATIONS": 2 }, "correctness": 1, "invalidity": "correct", @@ -29926,61 +32647,61 @@ { "name": "time", "unit": "", - "value": 1800.0 + "value": 6973.472 }, { "name": "dram__throughput.avg.pct_of_peak_sustained_elapsed", "type": "Double", "unit": "", - "value": 58.76478833820349 + "value": 7.452893973067049 }, { "name": "dram__sectors_read.sum", "type": "Double", "unit": "", - "value": 2102340.0 + "value": 6644.0 }, { "name": "dram__sectors_write.sum", "type": "Double", "unit": "", - "value": 1861848.0 + "value": 1873460.0 }, { "name": "lts__t_sectors.avg.pct_of_peak_sustained_elapsed", "type": "Double", "unit": "", - "value": 6.867946913635508 + "value": 1.0112051487471492 }, { "name": "lts__t_sectors_op_read.sum", "type": "Double", "unit": "", - "value": 2122094.0 + "value": 115719.0 }, { "name": "lts__t_sectors_op_write.sum", "type": "Double", "unit": "", - "value": 2100198.0 + "value": 2109299.0 }, { "name": "l1tex__t_requests_pipe_lsu_mem_global_op_ld.avg.pct_of_peak_sustained_active", "type": "Double", "unit": "", - "value": 6.642277354809982 + "value": 26.284687719150483 }, { "name": "l1tex__t_requests_pipe_lsu_mem_global_op_ld.sum", "type": "Double", "unit": "", - "value": 4718592.0 + "value": 67108864.0 }, { "name": "l1tex__data_pipe_lsu_wavefronts_mem_shared.avg.pct_of_peak_sustained_elapsed", "type": "Double", "unit": "", - "value": 0.06917968477303793 + "value": 0.3079546910884327 }, { "name": "l1tex__data_pipe_lsu_wavefronts_mem_shared_op_ld.sum", @@ -30010,13 +32731,13 @@ "name": "sm__warps_active.avg.pct_of_peak_sustained_active", "type": "Double", "unit": "", - "value": 81.36604452961046 + "value": 97.75577419366488 }, { "name": "smsp__cycles_active.avg.pct_of_peak_sustained_elapsed", "type": "Double", "unit": "", - "value": 99.92184948970974 + "value": 99.97101177231865 }, { "name": "smsp__sass_thread_inst_executed_op_fp16_pred_on.sum", @@ -30028,7 +32749,7 @@ "name": "smsp__sass_thread_inst_executed_op_fp32_pred_on.sum", "type": "Double", "unit": "", - "value": 4445962240.0 + "value": 6979321856.0 }, { "name": "smsp__sass_thread_inst_executed_op_fp64_pred_on.sum", @@ -30040,43 +32761,43 @@ "name": "smsp__sass_thread_inst_executed_op_integer_pred_on.sum", "type": "Double", "unit": "", - "value": 138412032.0 + "value": 10880024576.0 }, { "name": "smsp__sass_thread_inst_executed_op_control_pred_on.sum", "type": "Double", "unit": "", - "value": 33554432.0 + "value": 1073741824.0 }, { "name": "smsp__sass_thread_inst_executed_op_memory_pred_on.sum", "type": "Double", "unit": "", - "value": 167772160.0 + "value": 2164260864.0 }, { "name": "smsp__sass_thread_inst_executed_op_misc_pred_on.sum", "type": "Double", "unit": "", - "value": 283639808.0 + "value": 2248146944.0 }, { "name": "smsp__sass_thread_inst_executed_op_conversion_pred_on.sum", "type": "Double", "unit": "", - "value": 1572864.0 + "value": 25165824.0 }, { "name": "smsp__inst_executed.sum", "type": "Double", "unit": "", - "value": 167002112.0 + "value": 816054272.0 }, { "name": "smsp__pipe_fma_cycles_active.avg.pct_of_peak_sustained_active", "type": "Double", "unit": "", - "value": 37.67390624401305 + "value": 43.023644187259244 }, { "name": "smsp__inst_executed_pipe_fp64.avg.pct_of_peak_sustained_active", @@ -30088,13 +32809,13 @@ "name": "smsp__inst_executed_pipe_xu.avg.pct_of_peak_sustained_active", "type": "Double", "unit": "", - "value": 94.52720310170822 + "value": 26.2864202735711 }, { "name": "smsp__inst_executed_pipe_lsu.avg.pct_of_peak_sustained_active", "type": "Double", "unit": "", - "value": 3.727085766827607 + "value": 13.399913459769644 }, { "name": "smsp__inst_executed_pipe_tex.avg.pct_of_peak_sustained_active", @@ -30106,7 +32827,7 @@ "name": "smsp__issue_active.avg.pct_of_peak_sustained_active", "type": "Double", "unit": "", - "value": 58.80963495285534 + "value": 79.91210167369344 }, { "name": "smsp__thread_inst_executed_per_inst_executed.ratio", @@ -30125,30 +32846,47 @@ "time" ], "times": { - "compilation": 27329.878, - "data": 59851.29, - "framework": 180147.81600000002, - "kernel_overhead": 24813.556, - "profiling_overhead": 49774.189, - "profiling_runs": 45708.781, + "compilation_time": 15553.882, + "data": 57333.913, + "framework": 2169511.873, + "kernel_overhead": 1009381.208, + "profiling_overhead": 47911.838, + "profiling_runs": 1054884.914, "runtimes": [ - 1800.0 + 6973.472 ], - "search_algorithm": 34.712, - "validation": 19.836 + "search_algorithm": 26.78, + "validation": 18.146 }, - "timestamp": "2026-01-27 09:26:17 UTC" + "timestamp": "2026-03-13 09:41:4 UTC" }, { + "compilation_data": { + "constant_memory_size": 0, + "global_size": { + "x": 16, + "y": 32, + "z": 64 + }, + "local_memory_size": 0, + "local_size": { + "x": 16, + "y": 8, + "z": 1 + }, + "max_work_group_size": 1024, + "private_memory_size": 0, + "registers": 25 + }, "configuration": { - "INNER_UNROLL_FACTOR": "0", - "USE_CONSTANT_MEMORY": "0", - "USE_SOA": "1", - "VECTOR_SIZE": "1", - "WORK_GROUP_SIZE_X": "16", - "WORK_GROUP_SIZE_Y": "8", - "WORK_GROUP_SIZE_Z": "1", - "Z_ITERATIONS": "32" + "INNER_UNROLL_FACTOR": 1, + "USE_CONSTANT_MEMORY": 0, + "USE_SOA": 1, + "VECTOR_SIZE": 1, + "WORK_GROUP_SIZE_X": 16, + "WORK_GROUP_SIZE_Y": 8, + "WORK_GROUP_SIZE_Z": 1, + "Z_ITERATIONS": 4 }, "correctness": 1, "invalidity": "correct", @@ -30156,61 +32894,61 @@ { "name": "time", "unit": "", - "value": 1779.936 + "value": 6284.32 }, { "name": "dram__throughput.avg.pct_of_peak_sustained_elapsed", "type": "Double", "unit": "", - "value": 59.10739805640244 + "value": 7.799456196446572 }, { "name": "dram__sectors_read.sum", "type": "Double", "unit": "", - "value": 2099992.0 + "value": 24620.0 }, { "name": "dram__sectors_write.sum", "type": "Double", "unit": "", - "value": 1870512.0 + "value": 1876840.0 }, { "name": "lts__t_sectors.avg.pct_of_peak_sustained_elapsed", "type": "Double", "unit": "", - "value": 6.904508279555638 + "value": 1.036195896537624 }, { "name": "lts__t_sectors_op_read.sum", "type": "Double", "unit": "", - "value": 2121344.0 + "value": 134866.0 }, { "name": "lts__t_sectors_op_write.sum", "type": "Double", "unit": "", - "value": 2103212.0 + "value": 2105399.0 }, { "name": "l1tex__t_requests_pipe_lsu_mem_global_op_ld.avg.pct_of_peak_sustained_active", "type": "Double", "unit": "", - "value": 6.641422855124741 + "value": 13.38374898026212 }, { "name": "l1tex__t_requests_pipe_lsu_mem_global_op_ld.sum", "type": "Double", "unit": "", - "value": 4718592.0 + "value": 33554432.0 }, { "name": "l1tex__data_pipe_lsu_wavefronts_mem_shared.avg.pct_of_peak_sustained_elapsed", "type": "Double", "unit": "", - "value": 0.06914398086116218 + "value": 0.15680651186000782 }, { "name": "l1tex__data_pipe_lsu_wavefronts_mem_shared_op_ld.sum", @@ -30240,13 +32978,13 @@ "name": "sm__warps_active.avg.pct_of_peak_sustained_active", "type": "Double", "unit": "", - "value": 81.11007669172562 + "value": 98.52430640210879 }, { "name": "smsp__cycles_active.avg.pct_of_peak_sustained_elapsed", "type": "Double", "unit": "", - "value": 99.9204331464135 + "value": 99.97073087476257 }, { "name": "smsp__sass_thread_inst_executed_op_fp16_pred_on.sum", @@ -30258,7 +32996,7 @@ "name": "smsp__sass_thread_inst_executed_op_fp32_pred_on.sum", "type": "Double", "unit": "", - "value": 4445962240.0 + "value": 5637144576.0 }, { "name": "smsp__sass_thread_inst_executed_op_fp64_pred_on.sum", @@ -30270,43 +33008,43 @@ "name": "smsp__sass_thread_inst_executed_op_integer_pred_on.sum", "type": "Double", "unit": "", - "value": 138412032.0 + "value": 10854858752.0 }, { "name": "smsp__sass_thread_inst_executed_op_control_pred_on.sum", "type": "Double", "unit": "", - "value": 33554432.0 + "value": 1073741824.0 }, { "name": "smsp__sass_thread_inst_executed_op_memory_pred_on.sum", "type": "Double", "unit": "", - "value": 167772160.0 + "value": 1090519040.0 }, { "name": "smsp__sass_thread_inst_executed_op_misc_pred_on.sum", "type": "Double", "unit": "", - "value": 283639808.0 + "value": 1392508928.0 }, { "name": "smsp__sass_thread_inst_executed_op_conversion_pred_on.sum", "type": "Double", "unit": "", - "value": 1572864.0 + "value": 12582912.0 }, { "name": "smsp__inst_executed.sum", "type": "Double", "unit": "", - "value": 167002112.0 + "value": 837287936.0 }, { "name": "smsp__pipe_fma_cycles_active.avg.pct_of_peak_sustained_active", "type": "Double", "unit": "", - "value": 37.65641334068437 + "value": 36.23257970716396 }, { "name": "smsp__inst_executed_pipe_fp64.avg.pct_of_peak_sustained_active", @@ -30318,13 +33056,13 @@ "name": "smsp__inst_executed_pipe_xu.avg.pct_of_peak_sustained_active", "type": "Double", "unit": "", - "value": 94.47975643858112 + "value": 26.769479883367143 }, { "name": "smsp__inst_executed_pipe_lsu.avg.pct_of_peak_sustained_active", "type": "Double", "unit": "", - "value": 3.725215006062219 + "value": 6.875364462231989 }, { "name": "smsp__inst_executed_pipe_tex.avg.pct_of_peak_sustained_active", @@ -30336,7 +33074,7 @@ "name": "smsp__issue_active.avg.pct_of_peak_sustained_active", "type": "Double", "unit": "", - "value": 58.78011624451872 + "value": 83.4981651272205 }, { "name": "smsp__thread_inst_executed_per_inst_executed.ratio", @@ -30355,30 +33093,47 @@ "time" ], "times": { - "compilation": 27524.638, - "data": 60782.663, - "framework": 180862.272, - "kernel_overhead": 24454.314, - "profiling_overhead": 50236.251, - "profiling_runs": 45389.044, + "compilation_time": 14112.929, + "data": 58256.772, + "framework": 2749601.722, + "kernel_overhead": 1299832.146, + "profiling_overhead": 48085.475, + "profiling_runs": 1343427.329, "runtimes": [ - 1779.936 + 6284.32 ], - "search_algorithm": 34.956, - "validation": 20.868 + "search_algorithm": 25.525, + "validation": 14.695 }, - "timestamp": "2026-01-27 09:26:17 UTC" + "timestamp": "2026-03-13 09:41:6 UTC" }, { + "compilation_data": { + "constant_memory_size": 0, + "global_size": { + "x": 16, + "y": 32, + "z": 32 + }, + "local_memory_size": 0, + "local_size": { + "x": 16, + "y": 8, + "z": 1 + }, + "max_work_group_size": 1024, + "private_memory_size": 32, + "registers": 27 + }, "configuration": { - "INNER_UNROLL_FACTOR": "0", - "USE_CONSTANT_MEMORY": "0", - "USE_SOA": "1", - "VECTOR_SIZE": "1", - "WORK_GROUP_SIZE_X": "32", - "WORK_GROUP_SIZE_Y": "2", - "WORK_GROUP_SIZE_Z": "1", - "Z_ITERATIONS": "32" + "INNER_UNROLL_FACTOR": 1, + "USE_CONSTANT_MEMORY": 0, + "USE_SOA": 1, + "VECTOR_SIZE": 1, + "WORK_GROUP_SIZE_X": 16, + "WORK_GROUP_SIZE_Y": 8, + "WORK_GROUP_SIZE_Z": 1, + "Z_ITERATIONS": 8 }, "correctness": 1, "invalidity": "correct", @@ -30386,61 +33141,61 @@ { "name": "time", "unit": "", - "value": 1743.296 + "value": 8178.015 }, { "name": "dram__throughput.avg.pct_of_peak_sustained_elapsed", "type": "Double", "unit": "", - "value": 58.8692638623327 + "value": 6.189229409613588 }, { "name": "dram__sectors_read.sum", "type": "Double", "unit": "", - "value": 2102072.0 + "value": 18104.0 }, { "name": "dram__sectors_write.sum", "type": "Double", "unit": "", - "value": 1838872.0 + "value": 1931632.0 }, { "name": "lts__t_sectors.avg.pct_of_peak_sustained_elapsed", "type": "Double", "unit": "", - "value": 6.934920440151891 + "value": 48.85579725510688 }, { "name": "lts__t_sectors_op_read.sum", "type": "Double", "unit": "", - "value": 2131860.0 + "value": 927674.0 }, { "name": "lts__t_sectors_op_write.sum", "type": "Double", "unit": "", - "value": 2099878.0 + "value": 138418121.0 }, { "name": "l1tex__t_requests_pipe_lsu_mem_global_op_ld.avg.pct_of_peak_sustained_active", "type": "Double", "unit": "", - "value": 6.657152844642519 + "value": 5.095351362183663 }, { "name": "l1tex__t_requests_pipe_lsu_mem_global_op_ld.sum", "type": "Double", "unit": "", - "value": 4718592.0 + "value": 16777216.0 }, { "name": "l1tex__data_pipe_lsu_wavefronts_mem_shared.avg.pct_of_peak_sustained_elapsed", "type": "Double", "unit": "", - "value": 0.06930497063506431 + "value": 0.059257180285558556 }, { "name": "l1tex__data_pipe_lsu_wavefronts_mem_shared_op_ld.sum", @@ -30458,25 +33213,25 @@ "name": "l1tex__t_sectors_pipe_lsu_mem_local_op_ld.sum", "type": "Double", "unit": "", - "value": 0.0 + "value": 136314880.0 }, { "name": "l1tex__t_sectors_pipe_lsu_mem_local_op_st.sum", "type": "Double", "unit": "", - "value": 0.0 + "value": 136314880.0 }, { "name": "sm__warps_active.avg.pct_of_peak_sustained_active", "type": "Double", "unit": "", - "value": 81.53269125028082 + "value": 97.74065293890729 }, { "name": "smsp__cycles_active.avg.pct_of_peak_sustained_elapsed", "type": "Double", "unit": "", - "value": 99.91977588685602 + "value": 97.85739577279008 }, { "name": "smsp__sass_thread_inst_executed_op_fp16_pred_on.sum", @@ -30488,7 +33243,7 @@ "name": "smsp__sass_thread_inst_executed_op_fp32_pred_on.sum", "type": "Double", "unit": "", - "value": 4445962240.0 + "value": 4966055936.0 }, { "name": "smsp__sass_thread_inst_executed_op_fp64_pred_on.sum", @@ -30500,43 +33255,43 @@ "name": "smsp__sass_thread_inst_executed_op_integer_pred_on.sum", "type": "Double", "unit": "", - "value": 138412032.0 + "value": 4250927104.0 }, { "name": "smsp__sass_thread_inst_executed_op_control_pred_on.sum", "type": "Double", "unit": "", - "value": 33554432.0 + "value": 1073741824.0 }, { "name": "smsp__sass_thread_inst_executed_op_memory_pred_on.sum", "type": "Double", "unit": "", - "value": 167772160.0 + "value": 2722103296.0 }, { "name": "smsp__sass_thread_inst_executed_op_misc_pred_on.sum", "type": "Double", "unit": "", - "value": 283639808.0 + "value": 289406976.0 }, { "name": "smsp__sass_thread_inst_executed_op_conversion_pred_on.sum", "type": "Double", "unit": "", - "value": 1572864.0 + "value": 6291456.0 }, { "name": "smsp__inst_executed.sum", "type": "Double", "unit": "", - "value": 167002112.0 + "value": 420413440.0 }, { "name": "smsp__pipe_fma_cycles_active.avg.pct_of_peak_sustained_active", "type": "Double", "unit": "", - "value": 37.74150834839213 + "value": 12.423106915801053 }, { "name": "smsp__inst_executed_pipe_fp64.avg.pct_of_peak_sustained_active", @@ -30548,13 +33303,13 @@ "name": "smsp__inst_executed_pipe_xu.avg.pct_of_peak_sustained_active", "type": "Double", "unit": "", - "value": 94.70035909697086 + "value": 20.669312432722727 }, { "name": "smsp__inst_executed_pipe_lsu.avg.pct_of_peak_sustained_active", "type": "Double", "unit": "", - "value": 3.7339130845119124 + "value": 13.130261462388804 }, { "name": "smsp__inst_executed_pipe_tex.avg.pct_of_peak_sustained_active", @@ -30566,7 +33321,7 @@ "name": "smsp__issue_active.avg.pct_of_peak_sustained_active", "type": "Double", "unit": "", - "value": 58.91736310451067 + "value": 32.371860706101536 }, { "name": "smsp__thread_inst_executed_per_inst_executed.ratio", @@ -30585,30 +33340,47 @@ "time" ], "times": { - "compilation": 28183.933, - "data": 59473.247, - "framework": 178807.112, - "kernel_overhead": 24499.877, - "profiling_overhead": 49653.564, - "profiling_runs": 45180.424, + "compilation_time": 14245.559, + "data": 59050.322, + "framework": 924300.7679999999, + "kernel_overhead": 381068.506, + "profiling_overhead": 48601.672, + "profiling_runs": 435580.268, "runtimes": [ - 1743.296 + 8178.015 ], - "search_algorithm": 32.887, - "validation": 18.844 + "search_algorithm": 25.27, + "validation": 16.187 }, - "timestamp": "2026-01-27 09:26:17 UTC" + "timestamp": "2026-03-13 09:41:6 UTC" }, { + "compilation_data": { + "constant_memory_size": 0, + "global_size": { + "x": 16, + "y": 32, + "z": 16 + }, + "local_memory_size": 0, + "local_size": { + "x": 16, + "y": 8, + "z": 1 + }, + "max_work_group_size": 1024, + "private_memory_size": 64, + "registers": 34 + }, "configuration": { - "INNER_UNROLL_FACTOR": "0", - "USE_CONSTANT_MEMORY": "0", - "USE_SOA": "1", - "VECTOR_SIZE": "1", - "WORK_GROUP_SIZE_X": "32", - "WORK_GROUP_SIZE_Y": "4", - "WORK_GROUP_SIZE_Z": "1", - "Z_ITERATIONS": "32" + "INNER_UNROLL_FACTOR": 1, + "USE_CONSTANT_MEMORY": 0, + "USE_SOA": 1, + "VECTOR_SIZE": 1, + "WORK_GROUP_SIZE_X": 16, + "WORK_GROUP_SIZE_Y": 8, + "WORK_GROUP_SIZE_Z": 1, + "Z_ITERATIONS": 16 }, "correctness": 1, "invalidity": "correct", @@ -30616,61 +33388,61 @@ { "name": "time", "unit": "", - "value": 1870.208 + "value": 8367.039 }, { "name": "dram__throughput.avg.pct_of_peak_sustained_elapsed", "type": "Double", "unit": "", - "value": 58.42777145026575 + "value": 6.139049490767765 }, { "name": "dram__sectors_read.sum", "type": "Double", "unit": "", - "value": 2099992.0 + "value": 22592.0 }, { "name": "dram__sectors_write.sum", "type": "Double", "unit": "", - "value": 1839816.0 + "value": 2023564.0 }, { "name": "lts__t_sectors.avg.pct_of_peak_sustained_elapsed", "type": "Double", "unit": "", - "value": 6.920566362835337 + "value": 54.50557589469205 }, { "name": "lts__t_sectors_op_read.sum", "type": "Double", "unit": "", - "value": 2130165.0 + "value": 18228045.0 }, { "name": "lts__t_sectors_op_write.sum", "type": "Double", "unit": "", - "value": 2103068.0 + "value": 138421860.0 }, { "name": "l1tex__t_requests_pipe_lsu_mem_global_op_ld.avg.pct_of_peak_sustained_active", "type": "Double", "unit": "", - "value": 6.657453500671517 + "value": 2.53485035165659 }, { "name": "l1tex__t_requests_pipe_lsu_mem_global_op_ld.sum", "type": "Double", "unit": "", - "value": 4718592.0 + "value": 8388608.0 }, { "name": "l1tex__data_pipe_lsu_wavefronts_mem_shared.avg.pct_of_peak_sustained_elapsed", "type": "Double", "unit": "", - "value": 0.06930733165306673 + "value": 0.029559202008351482 }, { "name": "l1tex__data_pipe_lsu_wavefronts_mem_shared_op_ld.sum", @@ -30688,25 +33460,25 @@ "name": "l1tex__t_sectors_pipe_lsu_mem_local_op_ld.sum", "type": "Double", "unit": "", - "value": 0.0 + "value": 136314880.0 }, { "name": "l1tex__t_sectors_pipe_lsu_mem_local_op_st.sum", "type": "Double", "unit": "", - "value": 0.0 + "value": 136314880.0 }, { "name": "sm__warps_active.avg.pct_of_peak_sustained_active", "type": "Double", "unit": "", - "value": 81.47596059718435 + "value": 90.0053258015499 }, { "name": "smsp__cycles_active.avg.pct_of_peak_sustained_elapsed", "type": "Double", "unit": "", - "value": 99.9251701633742 + "value": 97.9527473465697 }, { "name": "smsp__sass_thread_inst_executed_op_fp16_pred_on.sum", @@ -30718,7 +33490,7 @@ "name": "smsp__sass_thread_inst_executed_op_fp32_pred_on.sum", "type": "Double", "unit": "", - "value": 4445962240.0 + "value": 4630511616.0 }, { "name": "smsp__sass_thread_inst_executed_op_fp64_pred_on.sum", @@ -30730,43 +33502,43 @@ "name": "smsp__sass_thread_inst_executed_op_integer_pred_on.sum", "type": "Double", "unit": "", - "value": 138412032.0 + "value": 3769630720.0 }, { "name": "smsp__sass_thread_inst_executed_op_control_pred_on.sum", "type": "Double", "unit": "", - "value": 33554432.0 + "value": 1073741824.0 }, { "name": "smsp__sass_thread_inst_executed_op_memory_pred_on.sum", "type": "Double", "unit": "", - "value": 167772160.0 + "value": 2453667840.0 }, { "name": "smsp__sass_thread_inst_executed_op_misc_pred_on.sum", "type": "Double", "unit": "", - "value": 283639808.0 + "value": 144703488.0 }, { "name": "smsp__sass_thread_inst_executed_op_conversion_pred_on.sum", "type": "Double", "unit": "", - "value": 1572864.0 + "value": 3145728.0 }, { "name": "smsp__inst_executed.sum", "type": "Double", "unit": "", - "value": 167002112.0 + "value": 379617280.0 }, { "name": "smsp__pipe_fma_cycles_active.avg.pct_of_peak_sustained_active", "type": "Double", "unit": "", - "value": 37.74225098919493 + "value": 10.114389765273362 }, { "name": "smsp__inst_executed_pipe_fp64.avg.pct_of_peak_sustained_active", @@ -30778,13 +33550,13 @@ "name": "smsp__inst_executed_pipe_xu.avg.pct_of_peak_sustained_active", "type": "Double", "unit": "", - "value": 94.6984728628508 + "value": 20.60083300468511 }, { "name": "smsp__inst_executed_pipe_lsu.avg.pct_of_peak_sustained_active", "type": "Double", "unit": "", - "value": 3.733838712732033 + "value": 11.784119074701469 }, { "name": "smsp__inst_executed_pipe_tex.avg.pct_of_peak_sustained_active", @@ -30796,7 +33568,7 @@ "name": "smsp__issue_active.avg.pct_of_peak_sustained_active", "type": "Double", "unit": "", - "value": 58.916189593220835 + "value": 29.13374930985101 }, { "name": "smsp__thread_inst_executed_per_inst_executed.ratio", @@ -30815,30 +33587,47 @@ "time" ], "times": { - "compilation": 27145.469, - "data": 59696.556, - "framework": 179297.17700000003, - "kernel_overhead": 24445.83, - "profiling_overhead": 49816.497, - "profiling_runs": 45338.294, + "compilation_time": 14794.988, + "data": 58639.431, + "framework": 888559.701, + "kernel_overhead": 363191.664, + "profiling_overhead": 48385.595, + "profiling_runs": 418343.011, "runtimes": [ - 1870.208 + 8367.039 ], - "search_algorithm": 33.692, - "validation": 21.384 + "search_algorithm": 32.72, + "validation": 18.514 }, - "timestamp": "2026-01-27 09:26:18 UTC" + "timestamp": "2026-03-13 09:41:7 UTC" }, { + "compilation_data": { + "constant_memory_size": 0, + "global_size": { + "x": 16, + "y": 32, + "z": 8 + }, + "local_memory_size": 0, + "local_size": { + "x": 16, + "y": 8, + "z": 1 + }, + "max_work_group_size": 1024, + "private_memory_size": 128, + "registers": 32 + }, "configuration": { - "INNER_UNROLL_FACTOR": "0", - "USE_CONSTANT_MEMORY": "0", - "USE_SOA": "1", - "VECTOR_SIZE": "1", - "WORK_GROUP_SIZE_X": "32", - "WORK_GROUP_SIZE_Y": "8", - "WORK_GROUP_SIZE_Z": "1", - "Z_ITERATIONS": "32" + "INNER_UNROLL_FACTOR": 1, + "USE_CONSTANT_MEMORY": 0, + "USE_SOA": 1, + "VECTOR_SIZE": 1, + "WORK_GROUP_SIZE_X": 16, + "WORK_GROUP_SIZE_Y": 8, + "WORK_GROUP_SIZE_Z": 1, + "Z_ITERATIONS": 32 }, "correctness": 1, "invalidity": "correct", @@ -30846,61 +33635,61 @@ { "name": "time", "unit": "", - "value": 1747.04 + "value": 9763.488 }, { "name": "dram__throughput.avg.pct_of_peak_sustained_elapsed", "type": "Double", "unit": "", - "value": 58.885401139013105 + "value": 5.6436501930093925 }, { "name": "dram__sectors_read.sum", "type": "Double", "unit": "", - "value": 2103840.0 + "value": 14988.0 }, { "name": "dram__sectors_write.sum", "type": "Double", "unit": "", - "value": 1840144.0 + "value": 2309264.0 }, { "name": "lts__t_sectors.avg.pct_of_peak_sustained_elapsed", "type": "Double", "unit": "", - "value": 6.951710303821272 + "value": 77.08855189866334 }, { "name": "lts__t_sectors_op_read.sum", "type": "Double", "unit": "", - "value": 2133210.0 + "value": 121628827.0 }, { "name": "lts__t_sectors_op_write.sum", "type": "Double", "unit": "", - "value": 2103694.0 + "value": 138421568.0 }, { "name": "l1tex__t_requests_pipe_lsu_mem_global_op_ld.avg.pct_of_peak_sustained_active", "type": "Double", "unit": "", - "value": 6.681718518939764 + "value": 1.100986716568427 }, { "name": "l1tex__t_requests_pipe_lsu_mem_global_op_ld.sum", "type": "Double", "unit": "", - "value": 4718592.0 + "value": 4194304.0 }, { "name": "l1tex__data_pipe_lsu_wavefronts_mem_shared.avg.pct_of_peak_sustained_elapsed", "type": "Double", "unit": "", - "value": 0.06950473127665394 + "value": 0.012832633922758657 }, { "name": "l1tex__data_pipe_lsu_wavefronts_mem_shared_op_ld.sum", @@ -30918,25 +33707,25 @@ "name": "l1tex__t_sectors_pipe_lsu_mem_local_op_ld.sum", "type": "Double", "unit": "", - "value": 0.0 + "value": 136314880.0 }, { "name": "l1tex__t_sectors_pipe_lsu_mem_local_op_st.sum", "type": "Double", "unit": "", - "value": 0.0 + "value": 136314880.0 }, { "name": "sm__warps_active.avg.pct_of_peak_sustained_active", "type": "Double", "unit": "", - "value": 81.0640922590476 + "value": 95.31394108218996 }, { "name": "smsp__cycles_active.avg.pct_of_peak_sustained_elapsed", "type": "Double", "unit": "", - "value": 99.82602853483709 + "value": 99.45515036216894 }, { "name": "smsp__sass_thread_inst_executed_op_fp16_pred_on.sum", @@ -30948,7 +33737,7 @@ "name": "smsp__sass_thread_inst_executed_op_fp32_pred_on.sum", "type": "Double", "unit": "", - "value": 4445962240.0 + "value": 4462739456.0 }, { "name": "smsp__sass_thread_inst_executed_op_fp64_pred_on.sum", @@ -30960,25 +33749,25 @@ "name": "smsp__sass_thread_inst_executed_op_integer_pred_on.sum", "type": "Double", "unit": "", - "value": 138412032.0 + "value": 3528982528.0 }, { "name": "smsp__sass_thread_inst_executed_op_control_pred_on.sum", "type": "Double", "unit": "", - "value": 33554432.0 + "value": 1073741824.0 }, { "name": "smsp__sass_thread_inst_executed_op_memory_pred_on.sum", "type": "Double", "unit": "", - "value": 167772160.0 + "value": 2319450112.0 }, { "name": "smsp__sass_thread_inst_executed_op_misc_pred_on.sum", "type": "Double", "unit": "", - "value": 283639808.0 + "value": 72351744.0 }, { "name": "smsp__sass_thread_inst_executed_op_conversion_pred_on.sum", @@ -30990,13 +33779,13 @@ "name": "smsp__inst_executed.sum", "type": "Double", "unit": "", - "value": 167002112.0 + "value": 359219200.0 }, { "name": "smsp__pipe_fma_cycles_active.avg.pct_of_peak_sustained_active", "type": "Double", "unit": "", - "value": 37.89024463778831 + "value": 7.678263995499303 }, { "name": "smsp__inst_executed_pipe_fp64.avg.pct_of_peak_sustained_active", @@ -31008,13 +33797,13 @@ "name": "smsp__inst_executed_pipe_xu.avg.pct_of_peak_sustained_active", "type": "Double", "unit": "", - "value": 95.06250807450934 + "value": 17.61680796359352 }, { "name": "smsp__inst_executed_pipe_lsu.avg.pct_of_peak_sustained_active", "type": "Double", "unit": "", - "value": 3.748192151863589 + "value": 9.520215924661683 }, { "name": "smsp__inst_executed_pipe_tex.avg.pct_of_peak_sustained_active", @@ -31026,7 +33815,7 @@ "name": "smsp__issue_active.avg.pct_of_peak_sustained_active", "type": "Double", "unit": "", - "value": 59.142672311476964 + "value": 23.575054109401993 }, { "name": "smsp__thread_inst_executed_per_inst_executed.ratio", @@ -31045,30 +33834,47 @@ "time" ], "times": { - "compilation": 27185.912, - "data": 59721.789, - "framework": 179462.00199999998, - "kernel_overhead": 24547.074, - "profiling_overhead": 49692.159, - "profiling_runs": 45500.98, + "compilation_time": 14117.891, + "data": 59458.56, + "framework": 878308.6239999998, + "kernel_overhead": 352937.437, + "profiling_overhead": 48879.404, + "profiling_runs": 417033.223, "runtimes": [ - 1747.04 + 9763.488 ], - "search_algorithm": 32.269, - "validation": 18.74 + "search_algorithm": 24.945, + "validation": 16.943 }, - "timestamp": "2026-01-27 09:26:18 UTC" + "timestamp": "2026-03-13 09:41:7 UTC" }, { + "compilation_data": { + "constant_memory_size": 0, + "global_size": { + "x": 16, + "y": 32, + "z": 64 + }, + "local_memory_size": 0, + "local_size": { + "x": 16, + "y": 8, + "z": 1 + }, + "max_work_group_size": 1024, + "private_memory_size": 0, + "registers": 26 + }, "configuration": { - "INNER_UNROLL_FACTOR": "1", - "USE_CONSTANT_MEMORY": "0", - "USE_SOA": "1", - "VECTOR_SIZE": "1", - "WORK_GROUP_SIZE_X": "16", - "WORK_GROUP_SIZE_Y": "4", - "WORK_GROUP_SIZE_Z": "1", - "Z_ITERATIONS": "2" + "INNER_UNROLL_FACTOR": 2, + "USE_CONSTANT_MEMORY": 0, + "USE_SOA": 1, + "VECTOR_SIZE": 1, + "WORK_GROUP_SIZE_X": 16, + "WORK_GROUP_SIZE_Y": 8, + "WORK_GROUP_SIZE_Z": 1, + "Z_ITERATIONS": 4 }, "correctness": 1, "invalidity": "correct", @@ -31076,61 +33882,61 @@ { "name": "time", "unit": "", - "value": 6438.304 + "value": 4178.88 }, { "name": "dram__throughput.avg.pct_of_peak_sustained_elapsed", "type": "Double", "unit": "", - "value": 16.10962911747709 + "value": 11.63195267558 }, { "name": "dram__sectors_read.sum", "type": "Double", "unit": "", - "value": 2114244.0 + "value": 10188.0 }, { "name": "dram__sectors_write.sum", "type": "Double", "unit": "", - "value": 1882140.0 + "value": 1874032.0 }, { "name": "lts__t_sectors.avg.pct_of_peak_sustained_elapsed", "type": "Double", "unit": "", - "value": 1.9604890635516716 + "value": 1.5635577916351944 }, { "name": "lts__t_sectors_op_read.sum", "type": "Double", "unit": "", - "value": 2208767.0 + "value": 81318.0 }, { "name": "lts__t_sectors_op_write.sum", "type": "Double", "unit": "", - "value": 2145634.0 + "value": 2102345.0 }, { "name": "l1tex__t_requests_pipe_lsu_mem_global_op_ld.avg.pct_of_peak_sustained_active", "type": "Double", "unit": "", - "value": 26.32567915597991 + "value": 20.89477701003576 }, { "name": "l1tex__t_requests_pipe_lsu_mem_global_op_ld.sum", "type": "Double", "unit": "", - "value": 67633152.0 + "value": 33554432.0 }, { "name": "l1tex__data_pipe_lsu_wavefronts_mem_shared.avg.pct_of_peak_sustained_elapsed", "type": "Double", "unit": "", - "value": 0.30603939341911324 + "value": 0.24479242123620076 }, { "name": "l1tex__data_pipe_lsu_wavefronts_mem_shared_op_ld.sum", @@ -31160,13 +33966,13 @@ "name": "sm__warps_active.avg.pct_of_peak_sustained_active", "type": "Double", "unit": "", - "value": 98.15438374295712 + "value": 98.55281582316502 }, { "name": "smsp__cycles_active.avg.pct_of_peak_sustained_elapsed", "type": "Double", "unit": "", - "value": 99.97009067254479 + "value": 99.96530635060329 }, { "name": "smsp__sass_thread_inst_executed_op_fp16_pred_on.sum", @@ -31178,7 +33984,7 @@ "name": "smsp__sass_thread_inst_executed_op_fp32_pred_on.sum", "type": "Double", "unit": "", - "value": 6996099072.0 + "value": 5637144576.0 }, { "name": "smsp__sass_thread_inst_executed_op_fp64_pred_on.sum", @@ -31190,43 +33996,43 @@ "name": "smsp__sass_thread_inst_executed_op_integer_pred_on.sum", "type": "Double", "unit": "", - "value": 10880024576.0 + "value": 5221908480.0 }, { "name": "smsp__sass_thread_inst_executed_op_control_pred_on.sum", "type": "Double", "unit": "", - "value": 1073741824.0 + "value": 536870912.0 }, { "name": "smsp__sass_thread_inst_executed_op_memory_pred_on.sum", "type": "Double", "unit": "", - "value": 2181038080.0 + "value": 1090519040.0 }, { "name": "smsp__sass_thread_inst_executed_op_misc_pred_on.sum", "type": "Double", "unit": "", - "value": 2248146944.0 + "value": 2193620992.0 }, { "name": "smsp__sass_thread_inst_executed_op_conversion_pred_on.sum", "type": "Double", "unit": "", - "value": 25165824.0 + "value": 12582912.0 }, { "name": "smsp__inst_executed.sum", "type": "Double", "unit": "", - "value": 817102848.0 + "value": 535298048.0 }, { "name": "smsp__pipe_fma_cycles_active.avg.pct_of_peak_sustained_active", "type": "Double", "unit": "", - "value": 39.54169364265381 + "value": 39.60907409323793 }, { "name": "smsp__inst_executed_pipe_fp64.avg.pct_of_peak_sustained_active", @@ -31238,13 +34044,13 @@ "name": "smsp__inst_executed_pipe_xu.avg.pct_of_peak_sustained_active", "type": "Double", "unit": "", - "value": 26.123174837668223 + "value": 41.792405868411365 }, { "name": "smsp__inst_executed_pipe_lsu.avg.pct_of_peak_sustained_active", "type": "Double", "unit": "", - "value": 13.418740199817858 + "value": 10.733791741593935 }, { "name": "smsp__inst_executed_pipe_tex.avg.pct_of_peak_sustained_active", @@ -31256,7 +34062,7 @@ "name": "smsp__issue_active.avg.pct_of_peak_sustained_active", "type": "Double", "unit": "", - "value": 79.51785941361392 + "value": 83.34063246746459 }, { "name": "smsp__thread_inst_executed_per_inst_executed.ratio", @@ -31275,30 +34081,47 @@ "time" ], "times": { - "compilation": 27366.146, - "data": 60794.494, - "framework": 2163052.159, - "kernel_overhead": 1002881.631, - "profiling_overhead": 50809.513, - "profiling_runs": 1048566.521, + "compilation_time": 14371.914, + "data": 57771.465, + "framework": 1377157.131, + "kernel_overhead": 619625.518, + "profiling_overhead": 47592.833, + "profiling_runs": 652167.315, "runtimes": [ - 6438.304 + 4178.88 ], - "search_algorithm": 28.334, - "validation": 19.28 + "search_algorithm": 30.871, + "validation": 16.35 }, - "timestamp": "2026-01-27 09:26:19 UTC" + "timestamp": "2026-03-13 09:41:8 UTC" }, { + "compilation_data": { + "constant_memory_size": 0, + "global_size": { + "x": 16, + "y": 32, + "z": 32 + }, + "local_memory_size": 0, + "local_size": { + "x": 16, + "y": 8, + "z": 1 + }, + "max_work_group_size": 1024, + "private_memory_size": 0, + "registers": 30 + }, "configuration": { - "INNER_UNROLL_FACTOR": "1", - "USE_CONSTANT_MEMORY": "0", - "USE_SOA": "1", - "VECTOR_SIZE": "1", - "WORK_GROUP_SIZE_X": "16", - "WORK_GROUP_SIZE_Y": "8", - "WORK_GROUP_SIZE_Z": "1", - "Z_ITERATIONS": "2" + "INNER_UNROLL_FACTOR": 2, + "USE_CONSTANT_MEMORY": 0, + "USE_SOA": 1, + "VECTOR_SIZE": 1, + "WORK_GROUP_SIZE_X": 16, + "WORK_GROUP_SIZE_Y": 8, + "WORK_GROUP_SIZE_Z": 1, + "Z_ITERATIONS": 8 }, "correctness": 1, "invalidity": "correct", @@ -31306,61 +34129,61 @@ { "name": "time", "unit": "", - "value": 6435.168 + "value": 4284.288 }, { "name": "dram__throughput.avg.pct_of_peak_sustained_elapsed", "type": "Double", "unit": "", - "value": 16.00103273306395 + "value": 11.526607827362643 }, { "name": "dram__sectors_read.sum", "type": "Double", "unit": "", - "value": 2104008.0 + "value": 432.0 }, { "name": "dram__sectors_write.sum", "type": "Double", "unit": "", - "value": 1869860.0 + "value": 1870176.0 }, { "name": "lts__t_sectors.avg.pct_of_peak_sustained_elapsed", "type": "Double", "unit": "", - "value": 1.9117618329383548 + "value": 1.4774591308641016 }, { "name": "lts__t_sectors_op_read.sum", "type": "Double", "unit": "", - "value": 2197068.0 + "value": 71443.0 }, { "name": "lts__t_sectors_op_write.sum", "type": "Double", "unit": "", - "value": 2100589.0 + "value": 2099172.0 }, { "name": "l1tex__t_requests_pipe_lsu_mem_global_op_ld.avg.pct_of_peak_sustained_active", "type": "Double", "unit": "", - "value": 26.326201356829586 + "value": 9.818085455382157 }, { "name": "l1tex__t_requests_pipe_lsu_mem_global_op_ld.sum", "type": "Double", "unit": "", - "value": 67633152.0 + "value": 16777216.0 }, { "name": "l1tex__data_pipe_lsu_wavefronts_mem_shared.avg.pct_of_peak_sustained_elapsed", "type": "Double", "unit": "", - "value": 0.3060282084265532 + "value": 0.11502635693049244 }, { "name": "l1tex__data_pipe_lsu_wavefronts_mem_shared_op_ld.sum", @@ -31390,13 +34213,13 @@ "name": "sm__warps_active.avg.pct_of_peak_sustained_active", "type": "Double", "unit": "", - "value": 97.88367669500019 + "value": 98.80609597830465 }, { "name": "smsp__cycles_active.avg.pct_of_peak_sustained_elapsed", "type": "Double", "unit": "", - "value": 99.96459338553325 + "value": 99.96616528565768 }, { "name": "smsp__sass_thread_inst_executed_op_fp16_pred_on.sum", @@ -31408,7 +34231,7 @@ "name": "smsp__sass_thread_inst_executed_op_fp32_pred_on.sum", "type": "Double", "unit": "", - "value": 6996099072.0 + "value": 4966055936.0 }, { "name": "smsp__sass_thread_inst_executed_op_fp64_pred_on.sum", @@ -31420,43 +34243,43 @@ "name": "smsp__sass_thread_inst_executed_op_integer_pred_on.sum", "type": "Double", "unit": "", - "value": 10880024576.0 + "value": 5863636992.0 }, { "name": "smsp__sass_thread_inst_executed_op_control_pred_on.sum", "type": "Double", "unit": "", - "value": 1073741824.0 + "value": 536870912.0 }, { "name": "smsp__sass_thread_inst_executed_op_memory_pred_on.sum", "type": "Double", "unit": "", - "value": 2181038080.0 + "value": 553648128.0 }, { "name": "smsp__sass_thread_inst_executed_op_misc_pred_on.sum", "type": "Double", "unit": "", - "value": 2248146944.0 + "value": 1373634560.0 }, { "name": "smsp__sass_thread_inst_executed_op_conversion_pred_on.sum", "type": "Double", "unit": "", - "value": 25165824.0 + "value": 6291456.0 }, { "name": "smsp__inst_executed.sum", "type": "Double", "unit": "", - "value": 817102848.0 + "value": 621477888.0 }, { "name": "smsp__pipe_fma_cycles_active.avg.pct_of_peak_sustained_active", "type": "Double", "unit": "", - "value": 39.54267972387353 + "value": 43.26458975628555 }, { "name": "smsp__inst_executed_pipe_fp64.avg.pct_of_peak_sustained_active", @@ -31468,13 +34291,13 @@ "name": "smsp__inst_executed_pipe_xu.avg.pct_of_peak_sustained_active", "type": "Double", "unit": "", - "value": 26.123656621450447 + "value": 39.27561862564293 }, { "name": "smsp__inst_executed_pipe_lsu.avg.pct_of_peak_sustained_active", "type": "Double", "unit": "", - "value": 13.418987678596617 + "value": 5.12040535793294 }, { "name": "smsp__inst_executed_pipe_tex.avg.pct_of_peak_sustained_active", @@ -31486,7 +34309,7 @@ "name": "smsp__issue_active.avg.pct_of_peak_sustained_active", "type": "Double", "unit": "", - "value": 79.51934297412623 + "value": 90.93089209021228 }, { "name": "smsp__thread_inst_executed_per_inst_executed.ratio", @@ -31505,30 +34328,47 @@ "time" ], "times": { - "compilation": 17092.087, - "data": 64028.585, - "framework": 2167931.465, - "kernel_overhead": 1002409.582, - "profiling_overhead": 53669.152, - "profiling_runs": 1047824.146, + "compilation_time": 16370.808, + "data": 58272.684, + "framework": 1888064.4, + "kernel_overhead": 874479.164, + "profiling_overhead": 48256.965, + "profiling_runs": 907055.587, "runtimes": [ - 6435.168 + 4284.288 ], - "search_algorithm": 32.984, - "validation": 18.751 + "search_algorithm": 25.849, + "validation": 16.162 }, - "timestamp": "2026-01-27 09:26:20 UTC" + "timestamp": "2026-03-13 09:41:9 UTC" }, { + "compilation_data": { + "constant_memory_size": 0, + "global_size": { + "x": 16, + "y": 32, + "z": 16 + }, + "local_memory_size": 0, + "local_size": { + "x": 16, + "y": 8, + "z": 1 + }, + "max_work_group_size": 1024, + "private_memory_size": 0, + "registers": 38 + }, "configuration": { - "INNER_UNROLL_FACTOR": "1", - "USE_CONSTANT_MEMORY": "0", - "USE_SOA": "1", - "VECTOR_SIZE": "1", - "WORK_GROUP_SIZE_X": "32", - "WORK_GROUP_SIZE_Y": "2", - "WORK_GROUP_SIZE_Z": "1", - "Z_ITERATIONS": "2" + "INNER_UNROLL_FACTOR": 2, + "USE_CONSTANT_MEMORY": 0, + "USE_SOA": 1, + "VECTOR_SIZE": 1, + "WORK_GROUP_SIZE_X": 16, + "WORK_GROUP_SIZE_Y": 8, + "WORK_GROUP_SIZE_Z": 1, + "Z_ITERATIONS": 16 }, "correctness": 1, "invalidity": "correct", @@ -31536,61 +34376,61 @@ { "name": "time", "unit": "", - "value": 6591.84 + "value": 6945.472 }, { "name": "dram__throughput.avg.pct_of_peak_sustained_elapsed", "type": "Double", "unit": "", - "value": 14.165827656832514 + "value": 7.1399684236094245 }, { "name": "dram__sectors_read.sum", "type": "Double", "unit": "", - "value": 2099880.0 + "value": 10584.0 }, { "name": "dram__sectors_write.sum", "type": "Double", "unit": "", - "value": 1839716.0 + "value": 1870712.0 }, { "name": "lts__t_sectors.avg.pct_of_peak_sustained_elapsed", "type": "Double", "unit": "", - "value": 1.9050248681758468 + "value": 0.9322472814983518 }, { "name": "lts__t_sectors_op_read.sum", "type": "Double", "unit": "", - "value": 2205696.0 + "value": 120599.0 }, { "name": "lts__t_sectors_op_write.sum", "type": "Double", "unit": "", - "value": 2101222.0 + "value": 2101877.0 }, { "name": "l1tex__t_requests_pipe_lsu_mem_global_op_ld.avg.pct_of_peak_sustained_active", "type": "Double", "unit": "", - "value": 26.325839011182644 + "value": 3.041054828191242 }, { "name": "l1tex__t_requests_pipe_lsu_mem_global_op_ld.sum", "type": "Double", "unit": "", - "value": 67633152.0 + "value": 8388608.0 }, { "name": "l1tex__data_pipe_lsu_wavefronts_mem_shared.avg.pct_of_peak_sustained_elapsed", "type": "Double", "unit": "", - "value": 0.30607954798965453 + "value": 0.03563163466300399 }, { "name": "l1tex__data_pipe_lsu_wavefronts_mem_shared_op_ld.sum", @@ -31620,13 +34460,13 @@ "name": "sm__warps_active.avg.pct_of_peak_sustained_active", "type": "Double", "unit": "", - "value": 98.3033417062571 + "value": 98.82279811846546 }, { "name": "smsp__cycles_active.avg.pct_of_peak_sustained_elapsed", "type": "Double", "unit": "", - "value": 99.97954995822947 + "value": 99.98081860056001 }, { "name": "smsp__sass_thread_inst_executed_op_fp16_pred_on.sum", @@ -31638,7 +34478,7 @@ "name": "smsp__sass_thread_inst_executed_op_fp32_pred_on.sum", "type": "Double", "unit": "", - "value": 6996099072.0 + "value": 4630511616.0 }, { "name": "smsp__sass_thread_inst_executed_op_fp64_pred_on.sum", @@ -31650,43 +34490,43 @@ "name": "smsp__sass_thread_inst_executed_op_integer_pred_on.sum", "type": "Double", "unit": "", - "value": 10880024576.0 + "value": 8738832384.0 }, { "name": "smsp__sass_thread_inst_executed_op_control_pred_on.sum", "type": "Double", "unit": "", - "value": 1073741824.0 + "value": 536870912.0 }, { "name": "smsp__sass_thread_inst_executed_op_memory_pred_on.sum", "type": "Double", "unit": "", - "value": 2181038080.0 + "value": 285212672.0 }, { "name": "smsp__sass_thread_inst_executed_op_misc_pred_on.sum", "type": "Double", "unit": "", - "value": 2248146944.0 + "value": 1157627904.0 }, { "name": "smsp__sass_thread_inst_executed_op_conversion_pred_on.sum", "type": "Double", "unit": "", - "value": 25165824.0 + "value": 3145728.0 }, { "name": "smsp__inst_executed.sum", "type": "Double", "unit": "", - "value": 817102848.0 + "value": 970653696.0 }, { "name": "smsp__pipe_fma_cycles_active.avg.pct_of_peak_sustained_active", "type": "Double", "unit": "", - "value": 39.54330941912097 + "value": 36.47595619079078 }, { "name": "smsp__inst_executed_pipe_fp64.avg.pct_of_peak_sustained_active", @@ -31698,13 +34538,13 @@ "name": "smsp__inst_executed_pipe_xu.avg.pct_of_peak_sustained_active", "type": "Double", "unit": "", - "value": 26.124130490714727 + "value": 24.329195943531854 }, { "name": "smsp__inst_executed_pipe_lsu.avg.pct_of_peak_sustained_active", "type": "Double", "unit": "", - "value": 13.419231091910103 + "value": 1.6334299034353663 }, { "name": "smsp__inst_executed_pipe_tex.avg.pct_of_peak_sustained_active", @@ -31716,7 +34556,7 @@ "name": "smsp__issue_active.avg.pct_of_peak_sustained_active", "type": "Double", "unit": "", - "value": 79.52076799252309 + "value": 87.97391058321715 }, { "name": "smsp__thread_inst_executed_per_inst_executed.ratio", @@ -31735,30 +34575,47 @@ "time" ], "times": { - "compilation": 18608.595, - "data": 63942.851, - "framework": 2175718.323, - "kernel_overhead": 1005740.773, - "profiling_overhead": 53722.275, - "profiling_runs": 1052312.424, + "compilation_time": 14764.134, + "data": 58034.914, + "framework": 3120138.353, + "kernel_overhead": 1484122.298, + "profiling_overhead": 48005.368, + "profiling_runs": 1529975.773, "runtimes": [ - 6591.84 + 6945.472 ], - "search_algorithm": 29.999, - "validation": 17.152 + "search_algorithm": 26.452, + "validation": 16.898 }, - "timestamp": "2026-01-27 09:26:21 UTC" + "timestamp": "2026-03-13 09:41:11 UTC" }, { + "compilation_data": { + "constant_memory_size": 0, + "global_size": { + "x": 16, + "y": 32, + "z": 8 + }, + "local_memory_size": 0, + "local_size": { + "x": 16, + "y": 8, + "z": 1 + }, + "max_work_group_size": 1024, + "private_memory_size": 128, + "registers": 32 + }, "configuration": { - "INNER_UNROLL_FACTOR": "1", - "USE_CONSTANT_MEMORY": "0", - "USE_SOA": "1", - "VECTOR_SIZE": "1", - "WORK_GROUP_SIZE_X": "32", - "WORK_GROUP_SIZE_Y": "4", - "WORK_GROUP_SIZE_Z": "1", - "Z_ITERATIONS": "2" + "INNER_UNROLL_FACTOR": 2, + "USE_CONSTANT_MEMORY": 0, + "USE_SOA": 1, + "VECTOR_SIZE": 1, + "WORK_GROUP_SIZE_X": 16, + "WORK_GROUP_SIZE_Y": 8, + "WORK_GROUP_SIZE_Z": 1, + "Z_ITERATIONS": 32 }, "correctness": 1, "invalidity": "correct", @@ -31766,61 +34623,61 @@ { "name": "time", "unit": "", - "value": 6358.976 + "value": 9580.96 }, { "name": "dram__throughput.avg.pct_of_peak_sustained_elapsed", "type": "Double", "unit": "", - "value": 16.010354550168522 + "value": 6.315348934537718 }, { "name": "dram__sectors_read.sum", "type": "Double", "unit": "", - "value": 2108112.0 + "value": 28540.0 }, { "name": "dram__sectors_write.sum", "type": "Double", "unit": "", - "value": 1844012.0 + "value": 2289284.0 }, { "name": "lts__t_sectors.avg.pct_of_peak_sustained_elapsed", "type": "Double", "unit": "", - "value": 1.9333949107315092 + "value": 81.76639960739128 }, { "name": "lts__t_sectors_op_read.sum", "type": "Double", "unit": "", - "value": 2217382.0 + "value": 133101522.0 }, { "name": "lts__t_sectors_op_write.sum", "type": "Double", "unit": "", - "value": 2102508.0 + "value": 138421595.0 }, { "name": "l1tex__t_requests_pipe_lsu_mem_global_op_ld.avg.pct_of_peak_sustained_active", "type": "Double", "unit": "", - "value": 26.32570415884508 + "value": 1.108071134310571 }, { "name": "l1tex__t_requests_pipe_lsu_mem_global_op_ld.sum", "type": "Double", "unit": "", - "value": 67633152.0 + "value": 4194304.0 }, { "name": "l1tex__data_pipe_lsu_wavefronts_mem_shared.avg.pct_of_peak_sustained_elapsed", "type": "Double", "unit": "", - "value": 0.30606952265340703 + "value": 0.01286605243691179 }, { "name": "l1tex__data_pipe_lsu_wavefronts_mem_shared_op_ld.sum", @@ -31838,25 +34695,25 @@ "name": "l1tex__t_sectors_pipe_lsu_mem_local_op_ld.sum", "type": "Double", "unit": "", - "value": 0.0 + "value": 136314880.0 }, { "name": "l1tex__t_sectors_pipe_lsu_mem_local_op_st.sum", "type": "Double", "unit": "", - "value": 0.0 + "value": 136314880.0 }, { "name": "sm__warps_active.avg.pct_of_peak_sustained_active", "type": "Double", "unit": "", - "value": 97.80189336814355 + "value": 98.7460235550571 }, { "name": "smsp__cycles_active.avg.pct_of_peak_sustained_elapsed", "type": "Double", "unit": "", - "value": 99.97609425365243 + "value": 98.18829133473254 }, { "name": "smsp__sass_thread_inst_executed_op_fp16_pred_on.sum", @@ -31868,7 +34725,7 @@ "name": "smsp__sass_thread_inst_executed_op_fp32_pred_on.sum", "type": "Double", "unit": "", - "value": 6996099072.0 + "value": 4462739456.0 }, { "name": "smsp__sass_thread_inst_executed_op_fp64_pred_on.sum", @@ -31880,43 +34737,43 @@ "name": "smsp__sass_thread_inst_executed_op_integer_pred_on.sum", "type": "Double", "unit": "", - "value": 10880024576.0 + "value": 1918369792.0 }, { "name": "smsp__sass_thread_inst_executed_op_control_pred_on.sum", "type": "Double", "unit": "", - "value": 1073741824.0 + "value": 536870912.0 }, { "name": "smsp__sass_thread_inst_executed_op_memory_pred_on.sum", "type": "Double", "unit": "", - "value": 2181038080.0 + "value": 1245708288.0 }, { "name": "smsp__sass_thread_inst_executed_op_misc_pred_on.sum", "type": "Double", "unit": "", - "value": 2248146944.0 + "value": 72351744.0 }, { "name": "smsp__sass_thread_inst_executed_op_conversion_pred_on.sum", "type": "Double", "unit": "", - "value": 25165824.0 + "value": 1572864.0 }, { "name": "smsp__inst_executed.sum", "type": "Double", "unit": "", - "value": 817102848.0 + "value": 258555904.0 }, { "name": "smsp__pipe_fma_cycles_active.avg.pct_of_peak_sustained_active", "type": "Double", "unit": "", - "value": 39.54357690408299 + "value": 7.796475830097011 }, { "name": "smsp__inst_executed_pipe_fp64.avg.pct_of_peak_sustained_active", @@ -31928,13 +34785,13 @@ "name": "smsp__inst_executed_pipe_xu.avg.pct_of_peak_sustained_active", "type": "Double", "unit": "", - "value": 26.12417777943274 + "value": 17.890575364678316 }, { "name": "smsp__inst_executed_pipe_lsu.avg.pct_of_peak_sustained_active", "type": "Double", "unit": "", - "value": 13.419255382794551 + "value": 5.195517430733608 }, { "name": "smsp__inst_executed_pipe_tex.avg.pct_of_peak_sustained_active", @@ -31946,7 +34803,7 @@ "name": "smsp__issue_active.avg.pct_of_peak_sustained_active", "type": "Double", "unit": "", - "value": 79.52092935772245 + "value": 17.232442707789907 }, { "name": "smsp__thread_inst_executed_per_inst_executed.ratio", @@ -31965,30 +34822,47 @@ "time" ], "times": { - "compilation": 21608.529, - "data": 63647.289, - "framework": 2171150.613, - "kernel_overhead": 1004154.78, - "profiling_overhead": 53396.012, - "profiling_runs": 1049952.532, + "compilation_time": 14054.273, + "data": 58866.964, + "framework": 551568.298, + "kernel_overhead": 191131.968, + "profiling_overhead": 48635.01, + "profiling_runs": 252934.356, "runtimes": [ - 6358.976 + 9580.96 ], - "search_algorithm": 34.695, - "validation": 16.475 + "search_algorithm": 34.668, + "validation": 13.016 }, - "timestamp": "2026-01-27 09:26:22 UTC" + "timestamp": "2026-03-13 09:41:11 UTC" }, { + "compilation_data": { + "constant_memory_size": 0, + "global_size": { + "x": 16, + "y": 32, + "z": 32 + }, + "local_memory_size": 0, + "local_size": { + "x": 16, + "y": 8, + "z": 1 + }, + "max_work_group_size": 1024, + "private_memory_size": 0, + "registers": 31 + }, "configuration": { - "INNER_UNROLL_FACTOR": "1", - "USE_CONSTANT_MEMORY": "0", - "USE_SOA": "1", - "VECTOR_SIZE": "1", - "WORK_GROUP_SIZE_X": "32", - "WORK_GROUP_SIZE_Y": "8", - "WORK_GROUP_SIZE_Z": "1", - "Z_ITERATIONS": "2" + "INNER_UNROLL_FACTOR": 4, + "USE_CONSTANT_MEMORY": 0, + "USE_SOA": 1, + "VECTOR_SIZE": 1, + "WORK_GROUP_SIZE_X": 16, + "WORK_GROUP_SIZE_Y": 8, + "WORK_GROUP_SIZE_Z": 1, + "Z_ITERATIONS": 8 }, "correctness": 1, "invalidity": "correct", @@ -31996,61 +34870,61 @@ { "name": "time", "unit": "", - "value": 6411.84 + "value": 2858.432 }, { "name": "dram__throughput.avg.pct_of_peak_sustained_elapsed", "type": "Double", "unit": "", - "value": 15.573995843824894 + "value": 17.34734664214523 }, { "name": "dram__sectors_read.sum", "type": "Double", "unit": "", - "value": 2102140.0 + "value": 488.0 }, { "name": "dram__sectors_write.sum", "type": "Double", "unit": "", - "value": 1840508.0 + "value": 1870916.0 }, { "name": "lts__t_sectors.avg.pct_of_peak_sustained_elapsed", "type": "Double", "unit": "", - "value": 1.9422751297632612 + "value": 2.200680532479743 }, { "name": "lts__t_sectors_op_read.sum", "type": "Double", "unit": "", - "value": 2210961.0 + "value": 48572.0 }, { "name": "lts__t_sectors_op_write.sum", "type": "Double", "unit": "", - "value": 2108643.0 + "value": 2099246.0 }, { "name": "l1tex__t_requests_pipe_lsu_mem_global_op_ld.avg.pct_of_peak_sustained_active", "type": "Double", "unit": "", - "value": 26.341394927900307 + "value": 14.858901635311028 }, { "name": "l1tex__t_requests_pipe_lsu_mem_global_op_ld.sum", "type": "Double", "unit": "", - "value": 67633152.0 + "value": 16777216.0 }, { "name": "l1tex__data_pipe_lsu_wavefronts_mem_shared.avg.pct_of_peak_sustained_elapsed", "type": "Double", "unit": "", - "value": 0.3061730382223576 + "value": 0.17406320797727376 }, { "name": "l1tex__data_pipe_lsu_wavefronts_mem_shared_op_ld.sum", @@ -32080,13 +34954,13 @@ "name": "sm__warps_active.avg.pct_of_peak_sustained_active", "type": "Double", "unit": "", - "value": 97.37146713690687 + "value": 98.70357184047693 }, { "name": "smsp__cycles_active.avg.pct_of_peak_sustained_elapsed", "type": "Double", "unit": "", - "value": 99.95552641882381 + "value": 99.95780201358325 }, { "name": "smsp__sass_thread_inst_executed_op_fp16_pred_on.sum", @@ -32098,7 +34972,7 @@ "name": "smsp__sass_thread_inst_executed_op_fp32_pred_on.sum", "type": "Double", "unit": "", - "value": 6996099072.0 + "value": 4966055936.0 }, { "name": "smsp__sass_thread_inst_executed_op_fp64_pred_on.sum", @@ -32110,43 +34984,43 @@ "name": "smsp__sass_thread_inst_executed_op_integer_pred_on.sum", "type": "Double", "unit": "", - "value": 10880024576.0 + "value": 3042967552.0 }, { "name": "smsp__sass_thread_inst_executed_op_control_pred_on.sum", "type": "Double", "unit": "", - "value": 1073741824.0 + "value": 268435456.0 }, { "name": "smsp__sass_thread_inst_executed_op_memory_pred_on.sum", "type": "Double", "unit": "", - "value": 2181038080.0 + "value": 553648128.0 }, { "name": "smsp__sass_thread_inst_executed_op_misc_pred_on.sum", "type": "Double", "unit": "", - "value": 2248146944.0 + "value": 1644167168.0 }, { "name": "smsp__sass_thread_inst_executed_op_conversion_pred_on.sum", "type": "Double", "unit": "", - "value": 25165824.0 + "value": 6291456.0 }, { "name": "smsp__inst_executed.sum", "type": "Double", "unit": "", - "value": 817102848.0 + "value": 399179776.0 }, { "name": "smsp__pipe_fma_cycles_active.avg.pct_of_peak_sustained_active", "type": "Double", "unit": "", - "value": 39.56514127052398 + "value": 45.94308334736106 }, { "name": "smsp__inst_executed_pipe_fp64.avg.pct_of_peak_sustained_active", @@ -32158,13 +35032,13 @@ "name": "smsp__inst_executed_pipe_xu.avg.pct_of_peak_sustained_active", "type": "Double", "unit": "", - "value": 26.138390606672452 + "value": 59.438656905943574 }, { "name": "smsp__inst_executed_pipe_lsu.avg.pct_of_peak_sustained_active", "type": "Double", "unit": "", - "value": 13.426556112411825 + "value": 7.749082711858855 }, { "name": "smsp__inst_executed_pipe_tex.avg.pct_of_peak_sustained_active", @@ -32176,7 +35050,7 @@ "name": "smsp__issue_active.avg.pct_of_peak_sustained_active", "type": "Double", "unit": "", - "value": 79.56421677143032 + "value": 88.38969516411753 }, { "name": "smsp__thread_inst_executed_per_inst_executed.ratio", @@ -32195,30 +35069,47 @@ "time" ], "times": { - "compilation": 17769.848, - "data": 61664.789, - "framework": 2159080.7430000002, - "kernel_overhead": 1000396.238, - "profiling_overhead": 51374.141, - "profiling_runs": 1045645.575, + "compilation_time": 14430.771, + "data": 58779.781, + "framework": 1155892.145, + "kernel_overhead": 511364.765, + "profiling_overhead": 48573.338, + "profiling_runs": 537174.261, "runtimes": [ - 6411.84 + 2858.432 ], - "search_algorithm": 26.339, - "validation": 17.117 + "search_algorithm": 24.872, + "validation": 12.471 }, - "timestamp": "2026-01-27 09:26:23 UTC" + "timestamp": "2026-03-13 09:41:12 UTC" }, { + "compilation_data": { + "constant_memory_size": 0, + "global_size": { + "x": 16, + "y": 32, + "z": 16 + }, + "local_memory_size": 0, + "local_size": { + "x": 16, + "y": 8, + "z": 1 + }, + "max_work_group_size": 1024, + "private_memory_size": 0, + "registers": 38 + }, "configuration": { - "INNER_UNROLL_FACTOR": "1", - "USE_CONSTANT_MEMORY": "0", - "USE_SOA": "1", - "VECTOR_SIZE": "1", - "WORK_GROUP_SIZE_X": "16", - "WORK_GROUP_SIZE_Y": "4", - "WORK_GROUP_SIZE_Z": "1", - "Z_ITERATIONS": "4" + "INNER_UNROLL_FACTOR": 4, + "USE_CONSTANT_MEMORY": 0, + "USE_SOA": 1, + "VECTOR_SIZE": 1, + "WORK_GROUP_SIZE_X": 16, + "WORK_GROUP_SIZE_Y": 8, + "WORK_GROUP_SIZE_Z": 1, + "Z_ITERATIONS": 16 }, "correctness": 1, "invalidity": "correct", @@ -32226,61 +35117,61 @@ { "name": "time", "unit": "", - "value": 6629.952 + "value": 3538.432 }, { "name": "dram__throughput.avg.pct_of_peak_sustained_elapsed", "type": "Double", "unit": "", - "value": 15.704606155029035 + "value": 13.944986072423399 }, { "name": "dram__sectors_read.sum", "type": "Double", "unit": "", - "value": 2103268.0 + "value": 11012.0 }, { "name": "dram__sectors_write.sum", "type": "Double", "unit": "", - "value": 1870716.0 + "value": 1872940.0 }, { "name": "lts__t_sectors.avg.pct_of_peak_sustained_elapsed", "type": "Double", "unit": "", - "value": 1.8782740757950338 + "value": 1.8130216733063476 }, { "name": "lts__t_sectors_op_read.sum", "type": "Double", "unit": "", - "value": 2201237.0 + "value": 67561.0 }, { "name": "lts__t_sectors_op_write.sum", "type": "Double", "unit": "", - "value": 2100744.0 + "value": 2104761.0 }, { "name": "l1tex__t_requests_pipe_lsu_mem_global_op_ld.avg.pct_of_peak_sustained_active", "type": "Double", "unit": "", - "value": 12.863979162711637 + "value": 6.037114045470682 }, { "name": "l1tex__t_requests_pipe_lsu_mem_global_op_ld.sum", "type": "Double", "unit": "", - "value": 34078720.0 + "value": 8388608.0 }, { "name": "l1tex__data_pipe_lsu_wavefronts_mem_shared.avg.pct_of_peak_sustained_elapsed", "type": "Double", "unit": "", - "value": 0.14840688595631907 + "value": 0.07072536319269761 }, { "name": "l1tex__data_pipe_lsu_wavefronts_mem_shared_op_ld.sum", @@ -32310,13 +35201,13 @@ "name": "sm__warps_active.avg.pct_of_peak_sustained_active", "type": "Double", "unit": "", - "value": 98.75747187104278 + "value": 98.62355961730196 }, { "name": "smsp__cycles_active.avg.pct_of_peak_sustained_elapsed", "type": "Double", "unit": "", - "value": 99.97878579317475 + "value": 99.96455404141808 }, { "name": "smsp__sass_thread_inst_executed_op_fp16_pred_on.sum", @@ -32328,7 +35219,7 @@ "name": "smsp__sass_thread_inst_executed_op_fp32_pred_on.sum", "type": "Double", "unit": "", - "value": 5653921792.0 + "value": 4630511616.0 }, { "name": "smsp__sass_thread_inst_executed_op_fp64_pred_on.sum", @@ -32340,43 +35231,43 @@ "name": "smsp__sass_thread_inst_executed_op_integer_pred_on.sum", "type": "Double", "unit": "", - "value": 10322182144.0 + "value": 3301965824.0 }, { "name": "smsp__sass_thread_inst_executed_op_control_pred_on.sum", "type": "Double", "unit": "", - "value": 1073741824.0 + "value": 268435456.0 }, { "name": "smsp__sass_thread_inst_executed_op_memory_pred_on.sum", "type": "Double", "unit": "", - "value": 1107296256.0 + "value": 285212672.0 }, { "name": "smsp__sass_thread_inst_executed_op_misc_pred_on.sum", "type": "Double", "unit": "", - "value": 1929379840.0 + "value": 1429209088.0 }, { "name": "smsp__sass_thread_inst_executed_op_conversion_pred_on.sum", "type": "Double", "unit": "", - "value": 12582912.0 + "value": 3145728.0 }, { "name": "smsp__inst_executed.sum", "type": "Double", "unit": "", - "value": 838467584.0 + "value": 513540096.0 }, { "name": "smsp__pipe_fma_cycles_active.avg.pct_of_peak_sustained_active", "type": "Double", "unit": "", - "value": 30.392614032630156 + "value": 42.22021019097227 }, { "name": "smsp__inst_executed_pipe_fp64.avg.pct_of_peak_sustained_active", @@ -32388,13 +35279,13 @@ "name": "smsp__inst_executed_pipe_xu.avg.pct_of_peak_sustained_active", "type": "Double", "unit": "", - "value": 25.333482833989564 + "value": 48.29896797173099 }, { "name": "smsp__inst_executed_pipe_lsu.avg.pct_of_peak_sustained_active", "type": "Double", "unit": "", - "value": 6.605507731128138 + "value": 3.242728562555181 }, { "name": "smsp__inst_executed_pipe_tex.avg.pct_of_peak_sustained_active", @@ -32406,7 +35297,7 @@ "name": "smsp__issue_active.avg.pct_of_peak_sustained_active", "type": "Double", "unit": "", - "value": 79.13042865728826 + "value": 92.40083823626507 }, { "name": "smsp__thread_inst_executed_per_inst_executed.ratio", @@ -32425,30 +35316,47 @@ "time" ], "times": { - "compilation": 16306.776, - "data": 62940.6, - "framework": 2755674.253, - "kernel_overhead": 1297324.361, - "profiling_overhead": 52676.519, - "profiling_runs": 1342732.773, + "compilation_time": 14746.37, + "data": 59192.82, + "framework": 1840731.734, + "kernel_overhead": 851699.715, + "profiling_overhead": 48942.643, + "profiling_runs": 880896.556, "runtimes": [ - 6629.952 + 3538.432 ], - "search_algorithm": 24.797, - "validation": 14.054 + "search_algorithm": 25.644, + "validation": 17.5 }, - "timestamp": "2026-01-27 09:26:25 UTC" + "timestamp": "2026-03-13 09:41:13 UTC" }, { + "compilation_data": { + "constant_memory_size": 0, + "global_size": { + "x": 16, + "y": 32, + "z": 8 + }, + "local_memory_size": 0, + "local_size": { + "x": 16, + "y": 8, + "z": 1 + }, + "max_work_group_size": 1024, + "private_memory_size": 0, + "registers": 56 + }, "configuration": { - "INNER_UNROLL_FACTOR": "1", - "USE_CONSTANT_MEMORY": "0", - "USE_SOA": "1", - "VECTOR_SIZE": "1", - "WORK_GROUP_SIZE_X": "16", - "WORK_GROUP_SIZE_Y": "8", - "WORK_GROUP_SIZE_Z": "1", - "Z_ITERATIONS": "4" + "INNER_UNROLL_FACTOR": 4, + "USE_CONSTANT_MEMORY": 0, + "USE_SOA": 1, + "VECTOR_SIZE": 1, + "WORK_GROUP_SIZE_X": 16, + "WORK_GROUP_SIZE_Y": 8, + "WORK_GROUP_SIZE_Z": 1, + "Z_ITERATIONS": 32 }, "correctness": 1, "invalidity": "correct", @@ -32456,61 +35364,61 @@ { "name": "time", "unit": "", - "value": 6635.744 + "value": 5813.152 }, { "name": "dram__throughput.avg.pct_of_peak_sustained_elapsed", "type": "Double", "unit": "", - "value": 15.829065667358652 + "value": 8.653902702957021 }, { "name": "dram__sectors_read.sum", "type": "Double", "unit": "", - "value": 2110136.0 + "value": 13716.0 }, { "name": "dram__sectors_write.sum", "type": "Double", "unit": "", - "value": 1871920.0 + "value": 1869772.0 }, { "name": "lts__t_sectors.avg.pct_of_peak_sustained_elapsed", "type": "Double", "unit": "", - "value": 1.878923365940108 + "value": 1.1220443013060788 }, { "name": "lts__t_sectors_op_read.sum", "type": "Double", "unit": "", - "value": 2206183.0 + "value": 105382.0 }, { "name": "lts__t_sectors_op_write.sum", "type": "Double", "unit": "", - "value": 2102254.0 + "value": 2103924.0 }, { "name": "l1tex__t_requests_pipe_lsu_mem_global_op_ld.avg.pct_of_peak_sustained_active", "type": "Double", "unit": "", - "value": 12.864139505659509 + "value": 1.862824775326483 }, { "name": "l1tex__t_requests_pipe_lsu_mem_global_op_ld.sum", "type": "Double", "unit": "", - "value": 34078720.0 + "value": 4194304.0 }, { "name": "l1tex__data_pipe_lsu_wavefronts_mem_shared.avg.pct_of_peak_sustained_elapsed", "type": "Double", "unit": "", - "value": 0.14840805548490232 + "value": 0.021825152536289465 }, { "name": "l1tex__data_pipe_lsu_wavefronts_mem_shared_op_ld.sum", @@ -32540,13 +35448,13 @@ "name": "sm__warps_active.avg.pct_of_peak_sustained_active", "type": "Double", "unit": "", - "value": 98.63391254019487 + "value": 73.8706463653676 }, { "name": "smsp__cycles_active.avg.pct_of_peak_sustained_elapsed", "type": "Double", "unit": "", - "value": 99.97880374221275 + "value": 99.97302261151874 }, { "name": "smsp__sass_thread_inst_executed_op_fp16_pred_on.sum", @@ -32558,7 +35466,7 @@ "name": "smsp__sass_thread_inst_executed_op_fp32_pred_on.sum", "type": "Double", "unit": "", - "value": 5653921792.0 + "value": 4462739456.0 }, { "name": "smsp__sass_thread_inst_executed_op_fp64_pred_on.sum", @@ -32570,43 +35478,43 @@ "name": "smsp__sass_thread_inst_executed_op_integer_pred_on.sum", "type": "Double", "unit": "", - "value": 10322182144.0 + "value": 4806148096.0 }, { "name": "smsp__sass_thread_inst_executed_op_control_pred_on.sum", "type": "Double", "unit": "", - "value": 1073741824.0 + "value": 268435456.0 }, { "name": "smsp__sass_thread_inst_executed_op_memory_pred_on.sum", "type": "Double", "unit": "", - "value": 1107296256.0 + "value": 150994944.0 }, { "name": "smsp__sass_thread_inst_executed_op_misc_pred_on.sum", "type": "Double", "unit": "", - "value": 1929379840.0 + "value": 1222115328.0 }, { "name": "smsp__sass_thread_inst_executed_op_conversion_pred_on.sum", "type": "Double", "unit": "", - "value": 12582912.0 + "value": 1572864.0 }, { "name": "smsp__inst_executed.sum", "type": "Double", "unit": "", - "value": 838467584.0 + "value": 868564992.0 }, { "name": "smsp__pipe_fma_cycles_active.avg.pct_of_peak_sustained_active", "type": "Double", "unit": "", - "value": 30.39299106549274 + "value": 42.34317758635363 }, { "name": "smsp__inst_executed_pipe_fp64.avg.pct_of_peak_sustained_active", @@ -32618,13 +35526,13 @@ "name": "smsp__inst_executed_pipe_xu.avg.pct_of_peak_sustained_active", "type": "Double", "unit": "", - "value": 25.33367792777056 + "value": 29.806649318460437 }, { "name": "smsp__inst_executed_pipe_lsu.avg.pct_of_peak_sustained_active", "type": "Double", "unit": "", - "value": 6.605558600307362 + "value": 1.058805536092772 }, { "name": "smsp__inst_executed_pipe_tex.avg.pct_of_peak_sustained_active", @@ -32636,7 +35544,7 @@ "name": "smsp__issue_active.avg.pct_of_peak_sustained_active", "type": "Double", "unit": "", - "value": 79.13105682337185 + "value": 96.44464756727571 }, { "name": "smsp__thread_inst_executed_per_inst_executed.ratio", @@ -32655,30 +35563,47 @@ "time" ], "times": { - "compilation": 16318.298, - "data": 60609.95, - "framework": 2755585.902, - "kernel_overhead": 1299558.953, - "profiling_overhead": 50340.208, - "profiling_runs": 1345076.791, + "compilation_time": 14190.299, + "data": 57768.658, + "framework": 3306099.486, + "kernel_overhead": 1580555.949, + "profiling_overhead": 47517.934, + "profiling_runs": 1620256.945, "runtimes": [ - 6635.744 + 5813.152 ], - "search_algorithm": 31.231, - "validation": 19.457 + "search_algorithm": 25.745, + "validation": 15.061 }, - "timestamp": "2026-01-27 09:26:26 UTC" + "timestamp": "2026-03-13 09:41:14 UTC" }, { + "compilation_data": { + "constant_memory_size": 0, + "global_size": { + "x": 16, + "y": 32, + "z": 16 + }, + "local_memory_size": 0, + "local_size": { + "x": 16, + "y": 8, + "z": 1 + }, + "max_work_group_size": 1024, + "private_memory_size": 0, + "registers": 38 + }, "configuration": { - "INNER_UNROLL_FACTOR": "1", - "USE_CONSTANT_MEMORY": "0", - "USE_SOA": "1", - "VECTOR_SIZE": "1", - "WORK_GROUP_SIZE_X": "32", - "WORK_GROUP_SIZE_Y": "2", - "WORK_GROUP_SIZE_Z": "1", - "Z_ITERATIONS": "4" + "INNER_UNROLL_FACTOR": 8, + "USE_CONSTANT_MEMORY": 0, + "USE_SOA": 1, + "VECTOR_SIZE": 1, + "WORK_GROUP_SIZE_X": 16, + "WORK_GROUP_SIZE_Y": 8, + "WORK_GROUP_SIZE_Z": 1, + "Z_ITERATIONS": 16 }, "correctness": 1, "invalidity": "correct", @@ -32686,61 +35611,61 @@ { "name": "time", "unit": "", - "value": 6582.976 + "value": 3381.28 }, { "name": "dram__throughput.avg.pct_of_peak_sustained_elapsed", "type": "Double", "unit": "", - "value": 15.541393426134533 + "value": 14.604445100236084 }, { "name": "dram__sectors_read.sum", "type": "Double", "unit": "", - "value": 2110080.0 + "value": 544.0 }, { "name": "dram__sectors_write.sum", "type": "Double", "unit": "", - "value": 1845600.0 + "value": 1868152.0 }, { "name": "lts__t_sectors.avg.pct_of_peak_sustained_elapsed", "type": "Double", "unit": "", - "value": 1.8776953761408255 + "value": 1.859550594773616 }, { "name": "lts__t_sectors_op_read.sum", "type": "Double", "unit": "", - "value": 2222424.0 + "value": 54367.0 }, { "name": "lts__t_sectors_op_write.sum", "type": "Double", "unit": "", - "value": 2103404.0 + "value": 2099626.0 }, { "name": "l1tex__t_requests_pipe_lsu_mem_global_op_ld.avg.pct_of_peak_sustained_active", "type": "Double", "unit": "", - "value": 12.864180830307536 + "value": 6.226580988055755 }, { "name": "l1tex__t_requests_pipe_lsu_mem_global_op_ld.sum", "type": "Double", "unit": "", - "value": 34078720.0 + "value": 8388608.0 }, { "name": "l1tex__data_pipe_lsu_wavefronts_mem_shared.avg.pct_of_peak_sustained_elapsed", "type": "Double", "unit": "", - "value": 0.14841752889766469 + "value": 0.07294534892666808 }, { "name": "l1tex__data_pipe_lsu_wavefronts_mem_shared_op_ld.sum", @@ -32770,13 +35695,13 @@ "name": "sm__warps_active.avg.pct_of_peak_sustained_active", "type": "Double", "unit": "", - "value": 98.83671576102546 + "value": 98.67722484740781 }, { "name": "smsp__cycles_active.avg.pct_of_peak_sustained_elapsed", "type": "Double", "unit": "", - "value": 99.97996246729286 + "value": 99.9631106709405 }, { "name": "smsp__sass_thread_inst_executed_op_fp16_pred_on.sum", @@ -32788,7 +35713,7 @@ "name": "smsp__sass_thread_inst_executed_op_fp32_pred_on.sum", "type": "Double", "unit": "", - "value": 5653921792.0 + "value": 4630511616.0 }, { "name": "smsp__sass_thread_inst_executed_op_fp64_pred_on.sum", @@ -32800,43 +35725,43 @@ "name": "smsp__sass_thread_inst_executed_op_integer_pred_on.sum", "type": "Double", "unit": "", - "value": 10322182144.0 + "value": 2563768320.0 }, { "name": "smsp__sass_thread_inst_executed_op_control_pred_on.sum", "type": "Double", "unit": "", - "value": 1073741824.0 + "value": 134217728.0 }, { "name": "smsp__sass_thread_inst_executed_op_memory_pred_on.sum", "type": "Double", "unit": "", - "value": 1107296256.0 + "value": 285212672.0 }, { "name": "smsp__sass_thread_inst_executed_op_misc_pred_on.sum", "type": "Double", "unit": "", - "value": 1929379840.0 + "value": 1362100224.0 }, { "name": "smsp__sass_thread_inst_executed_op_conversion_pred_on.sum", "type": "Double", "unit": "", - "value": 12582912.0 + "value": 3145728.0 }, { "name": "smsp__inst_executed.sum", "type": "Double", "unit": "", - "value": 838467584.0 + "value": 484179968.0 }, { "name": "smsp__pipe_fma_cycles_active.avg.pct_of_peak_sustained_active", "type": "Double", "unit": "", - "value": 30.394779357296553 + "value": 42.76816609754263 }, { "name": "smsp__inst_executed_pipe_fp64.avg.pct_of_peak_sustained_active", @@ -32848,13 +35773,13 @@ "name": "smsp__inst_executed_pipe_xu.avg.pct_of_peak_sustained_active", "type": "Double", "unit": "", - "value": 25.335001441068215 + "value": 49.81573489097275 }, { "name": "smsp__inst_executed_pipe_lsu.avg.pct_of_peak_sustained_active", "type": "Double", "unit": "", - "value": 6.605903696059778 + "value": 3.3445622790570084 }, { "name": "smsp__inst_executed_pipe_tex.avg.pct_of_peak_sustained_active", @@ -32866,7 +35791,7 @@ "name": "smsp__issue_active.avg.pct_of_peak_sustained_active", "type": "Double", "unit": "", - "value": 79.13517351983182 + "value": 89.85397162001281 }, { "name": "smsp__thread_inst_executed_per_inst_executed.ratio", @@ -32885,30 +35810,47 @@ "time" ], "times": { - "compilation": 19062.045, - "data": 62065.318, - "framework": 2757402.683, - "kernel_overhead": 1298737.031, - "profiling_overhead": 51914.423, - "profiling_runs": 1344685.911, + "compilation_time": 14398.826, + "data": 57768.798, + "framework": 1710377.773, + "kernel_overhead": 788286.29, + "profiling_overhead": 47603.974, + "profiling_runs": 816718.711, "runtimes": [ - 6582.976 + 3381.28 ], - "search_algorithm": 24.095, - "validation": 16.462 + "search_algorithm": 31.619, + "validation": 15.236 }, - "timestamp": "2026-01-27 09:26:28 UTC" + "timestamp": "2026-03-13 09:41:15 UTC" }, { + "compilation_data": { + "constant_memory_size": 0, + "global_size": { + "x": 16, + "y": 32, + "z": 8 + }, + "local_memory_size": 0, + "local_size": { + "x": 16, + "y": 8, + "z": 1 + }, + "max_work_group_size": 1024, + "private_memory_size": 0, + "registers": 48 + }, "configuration": { - "INNER_UNROLL_FACTOR": "1", - "USE_CONSTANT_MEMORY": "0", - "USE_SOA": "1", - "VECTOR_SIZE": "1", - "WORK_GROUP_SIZE_X": "32", - "WORK_GROUP_SIZE_Y": "4", - "WORK_GROUP_SIZE_Z": "1", - "Z_ITERATIONS": "4" + "INNER_UNROLL_FACTOR": 8, + "USE_CONSTANT_MEMORY": 0, + "USE_SOA": 1, + "VECTOR_SIZE": 1, + "WORK_GROUP_SIZE_X": 16, + "WORK_GROUP_SIZE_Y": 8, + "WORK_GROUP_SIZE_Z": 1, + "Z_ITERATIONS": 32 }, "correctness": 1, "invalidity": "correct", @@ -32916,61 +35858,61 @@ { "name": "time", "unit": "", - "value": 6692.096 + "value": 6325.632 }, { "name": "dram__throughput.avg.pct_of_peak_sustained_elapsed", "type": "Double", "unit": "", - "value": 15.672574465486889 + "value": 7.8109824754261075 }, { "name": "dram__sectors_read.sum", "type": "Double", "unit": "", - "value": 2103472.0 + "value": 4732.0 }, { "name": "dram__sectors_write.sum", "type": "Double", "unit": "", - "value": 1839176.0 + "value": 1868844.0 }, { "name": "lts__t_sectors.avg.pct_of_peak_sustained_elapsed", "type": "Double", "unit": "", - "value": 1.8805231163092049 + "value": 1.0169308404857895 }, { "name": "lts__t_sectors_op_read.sum", "type": "Double", "unit": "", - "value": 2210989.0 + "value": 105846.0 }, { "name": "lts__t_sectors_op_write.sum", "type": "Double", "unit": "", - "value": 2101075.0 + "value": 2102077.0 }, { "name": "l1tex__t_requests_pipe_lsu_mem_global_op_ld.avg.pct_of_peak_sustained_active", "type": "Double", "unit": "", - "value": 12.864486572967332 + "value": 1.6605954408216101 }, { "name": "l1tex__t_requests_pipe_lsu_mem_global_op_ld.sum", "type": "Double", "unit": "", - "value": 34078720.0 + "value": 4194304.0 }, { "name": "l1tex__data_pipe_lsu_wavefronts_mem_shared.avg.pct_of_peak_sustained_elapsed", "type": "Double", "unit": "", - "value": 0.148409440123032 + "value": 0.019455918901244713 }, { "name": "l1tex__data_pipe_lsu_wavefronts_mem_shared_op_ld.sum", @@ -33000,13 +35942,13 @@ "name": "sm__warps_active.avg.pct_of_peak_sustained_active", "type": "Double", "unit": "", - "value": 98.71003872087879 + "value": 81.99701389020532 }, { "name": "smsp__cycles_active.avg.pct_of_peak_sustained_elapsed", "type": "Double", "unit": "", - "value": 99.97718948762238 + "value": 99.96841609519392 }, { "name": "smsp__sass_thread_inst_executed_op_fp16_pred_on.sum", @@ -33018,7 +35960,7 @@ "name": "smsp__sass_thread_inst_executed_op_fp32_pred_on.sum", "type": "Double", "unit": "", - "value": 5653921792.0 + "value": 4462739456.0 }, { "name": "smsp__sass_thread_inst_executed_op_fp64_pred_on.sum", @@ -33030,43 +35972,43 @@ "name": "smsp__sass_thread_inst_executed_op_integer_pred_on.sum", "type": "Double", "unit": "", - "value": 10322182144.0 + "value": 7219970048.0 }, { "name": "smsp__sass_thread_inst_executed_op_control_pred_on.sum", "type": "Double", "unit": "", - "value": 1073741824.0 + "value": 134217728.0 }, { "name": "smsp__sass_thread_inst_executed_op_memory_pred_on.sum", "type": "Double", "unit": "", - "value": 1107296256.0 + "value": 150994944.0 }, { "name": "smsp__sass_thread_inst_executed_op_misc_pred_on.sum", "type": "Double", "unit": "", - "value": 1929379840.0 + "value": 551026688.0 }, { "name": "smsp__sass_thread_inst_executed_op_conversion_pred_on.sum", "type": "Double", "unit": "", - "value": 12582912.0 + "value": 1572864.0 }, { "name": "smsp__inst_executed.sum", "type": "Double", "unit": "", - "value": 838467584.0 + "value": 952385536.0 }, { "name": "smsp__pipe_fma_cycles_active.avg.pct_of_peak_sustained_active", "type": "Double", "unit": "", - "value": 30.39399923498373 + "value": 43.2446669083844 }, { "name": "smsp__inst_executed_pipe_fp64.avg.pct_of_peak_sustained_active", @@ -33078,13 +36020,13 @@ "name": "smsp__inst_executed_pipe_xu.avg.pct_of_peak_sustained_active", "type": "Double", "unit": "", - "value": 25.334323336624614 + "value": 26.572207147109665 }, { "name": "smsp__inst_executed_pipe_lsu.avg.pct_of_peak_sustained_active", "type": "Double", "unit": "", - "value": 6.605726885623802 + "value": 0.9439101904063616 }, { "name": "smsp__inst_executed_pipe_tex.avg.pct_of_peak_sustained_active", @@ -33096,7 +36038,7 @@ "name": "smsp__issue_active.avg.pct_of_peak_sustained_active", "type": "Double", "unit": "", - "value": 79.13307279139869 + "value": 94.2761963369442 }, { "name": "smsp__thread_inst_executed_per_inst_executed.ratio", @@ -33115,30 +36057,47 @@ "time" ], "times": { - "compilation": 16256.327, - "data": 62786.725, - "framework": 2753235.6550000003, - "kernel_overhead": 1296527.019, - "profiling_overhead": 52051.265, - "profiling_runs": 1341870.646, + "compilation_time": 14989.173, + "data": 59245.283, + "framework": 3669195.885, + "kernel_overhead": 1758996.722, + "profiling_overhead": 48972.721, + "profiling_runs": 1801981.159, "runtimes": [ - 6692.096 + 6325.632 ], - "search_algorithm": 26.758, - "validation": 14.332 + "search_algorithm": 25.919, + "validation": 16.913 }, - "timestamp": "2026-01-27 09:26:29 UTC" + "timestamp": "2026-03-13 09:41:17 UTC" }, { + "compilation_data": { + "constant_memory_size": 0, + "global_size": { + "x": 16, + "y": 32, + "z": 8 + }, + "local_memory_size": 0, + "local_size": { + "x": 16, + "y": 8, + "z": 1 + }, + "max_work_group_size": 1024, + "private_memory_size": 0, + "registers": 48 + }, "configuration": { - "INNER_UNROLL_FACTOR": "1", - "USE_CONSTANT_MEMORY": "0", - "USE_SOA": "1", - "VECTOR_SIZE": "1", - "WORK_GROUP_SIZE_X": "32", - "WORK_GROUP_SIZE_Y": "8", - "WORK_GROUP_SIZE_Z": "1", - "Z_ITERATIONS": "4" + "INNER_UNROLL_FACTOR": 16, + "USE_CONSTANT_MEMORY": 0, + "USE_SOA": 1, + "VECTOR_SIZE": 1, + "WORK_GROUP_SIZE_X": 16, + "WORK_GROUP_SIZE_Y": 8, + "WORK_GROUP_SIZE_Z": 1, + "Z_ITERATIONS": 32 }, "correctness": 1, "invalidity": "correct", @@ -33146,61 +36105,61 @@ { "name": "time", "unit": "", - "value": 6637.472 + "value": 6000.64 }, { "name": "dram__throughput.avg.pct_of_peak_sustained_elapsed", "type": "Double", "unit": "", - "value": 15.704194748559056 + "value": 8.188328565350293 }, { "name": "dram__sectors_read.sum", "type": "Double", "unit": "", - "value": 2112852.0 + "value": 2880.0 }, { "name": "dram__sectors_write.sum", "type": "Double", "unit": "", - "value": 1842012.0 + "value": 1869268.0 }, { "name": "lts__t_sectors.avg.pct_of_peak_sustained_elapsed", "type": "Double", "unit": "", - "value": 1.8831714117403198 + "value": 1.0628362318840578 }, { "name": "lts__t_sectors_op_read.sum", "type": "Double", "unit": "", - "value": 2219177.0 + "value": 99480.0 }, { "name": "lts__t_sectors_op_write.sum", "type": "Double", "unit": "", - "value": 2103576.0 + "value": 2100167.0 }, { "name": "l1tex__t_requests_pipe_lsu_mem_global_op_ld.avg.pct_of_peak_sustained_active", "type": "Double", "unit": "", - "value": 12.850940677052956 + "value": 1.742217922090571 }, { "name": "l1tex__t_requests_pipe_lsu_mem_global_op_ld.sum", "type": "Double", "unit": "", - "value": 34078720.0 + "value": 4194304.0 }, { "name": "l1tex__data_pipe_lsu_wavefronts_mem_shared.avg.pct_of_peak_sustained_elapsed", "type": "Double", "unit": "", - "value": 0.14825043404844845 + "value": 0.020411647515963294 }, { "name": "l1tex__data_pipe_lsu_wavefronts_mem_shared_op_ld.sum", @@ -33230,13 +36189,13 @@ "name": "sm__warps_active.avg.pct_of_peak_sustained_active", "type": "Double", "unit": "", - "value": 98.34779429722413 + "value": 81.9846673466584 }, { "name": "smsp__cycles_active.avg.pct_of_peak_sustained_elapsed", "type": "Double", "unit": "", - "value": 99.97326369611629 + "value": 99.97362415811504 }, { "name": "smsp__sass_thread_inst_executed_op_fp16_pred_on.sum", @@ -33248,7 +36207,7 @@ "name": "smsp__sass_thread_inst_executed_op_fp32_pred_on.sum", "type": "Double", "unit": "", - "value": 5653921792.0 + "value": 4462739456.0 }, { "name": "smsp__sass_thread_inst_executed_op_fp64_pred_on.sum", @@ -33260,43 +36219,43 @@ "name": "smsp__sass_thread_inst_executed_op_integer_pred_on.sum", "type": "Double", "unit": "", - "value": 10322182144.0 + "value": 5944901632.0 }, { "name": "smsp__sass_thread_inst_executed_op_control_pred_on.sum", "type": "Double", "unit": "", - "value": 1073741824.0 + "value": 101187584.0 }, { "name": "smsp__sass_thread_inst_executed_op_memory_pred_on.sum", "type": "Double", "unit": "", - "value": 1107296256.0 + "value": 150994944.0 }, { "name": "smsp__sass_thread_inst_executed_op_misc_pred_on.sum", "type": "Double", "unit": "", - "value": 1929379840.0 + "value": 1054343168.0 }, { "name": "smsp__sass_thread_inst_executed_op_conversion_pred_on.sum", "type": "Double", "unit": "", - "value": 12582912.0 + "value": 1572864.0 }, { "name": "smsp__inst_executed.sum", "type": "Double", "unit": "", - "value": 838467584.0 + "value": 913571840.0 }, { "name": "smsp__pipe_fma_cycles_active.avg.pct_of_peak_sustained_active", "type": "Double", "unit": "", - "value": 30.36244940163969 + "value": 44.06007854343413 }, { "name": "smsp__inst_executed_pipe_fp64.avg.pct_of_peak_sustained_active", @@ -33308,13 +36267,13 @@ "name": "smsp__inst_executed_pipe_xu.avg.pct_of_peak_sustained_active", "type": "Double", "unit": "", - "value": 25.3081738812115 + "value": 27.87605528606123 }, { "name": "smsp__inst_executed_pipe_lsu.avg.pct_of_peak_sustained_active", "type": "Double", "unit": "", - "value": 6.598908619417451 + "value": 0.9902260849907004 }, { "name": "smsp__inst_executed_pipe_tex.avg.pct_of_peak_sustained_active", @@ -33326,7 +36285,7 @@ "name": "smsp__issue_active.avg.pct_of_peak_sustained_active", "type": "Double", "unit": "", - "value": 79.05141359871571 + "value": 94.98207744088447 }, { "name": "smsp__thread_inst_executed_per_inst_executed.ratio", @@ -33345,30 +36304,47 @@ "time" ], "times": { - "compilation": 17609.468, - "data": 60841.479, - "framework": 2750682.1330000004, - "kernel_overhead": 1296888.073, - "profiling_overhead": 50858.424, - "profiling_runs": 1342094.157, + "compilation_time": 14804.353, + "data": 57760.812, + "framework": 3579066.4639999997, + "kernel_overhead": 1716305.222, + "profiling_overhead": 47557.957, + "profiling_runs": 1757442.473, "runtimes": [ - 6637.472 + 6000.64 ], - "search_algorithm": 27.862, - "validation": 18.176 + "search_algorithm": 25.253, + "validation": 16.634 }, - "timestamp": "2026-01-27 09:26:31 UTC" + "timestamp": "2026-03-13 09:41:19 UTC" }, { + "compilation_data": { + "constant_memory_size": 0, + "global_size": { + "x": 8, + "y": 128, + "z": 256 + }, + "local_memory_size": 0, + "local_size": { + "x": 32, + "y": 2, + "z": 1 + }, + "max_work_group_size": 1024, + "private_memory_size": 0, + "registers": 38 + }, "configuration": { - "INNER_UNROLL_FACTOR": "1", - "USE_CONSTANT_MEMORY": "0", - "USE_SOA": "1", - "VECTOR_SIZE": "1", - "WORK_GROUP_SIZE_X": "16", - "WORK_GROUP_SIZE_Y": "4", - "WORK_GROUP_SIZE_Z": "1", - "Z_ITERATIONS": "8" + "INNER_UNROLL_FACTOR": 0, + "USE_CONSTANT_MEMORY": 0, + "USE_SOA": 1, + "VECTOR_SIZE": 1, + "WORK_GROUP_SIZE_X": 32, + "WORK_GROUP_SIZE_Y": 2, + "WORK_GROUP_SIZE_Z": 1, + "Z_ITERATIONS": 1 }, "correctness": 1, "invalidity": "correct", @@ -33376,61 +36352,61 @@ { "name": "time", "unit": "", - "value": 8484.672 + "value": 7055.296 }, { "name": "dram__throughput.avg.pct_of_peak_sustained_elapsed", "type": "Double", "unit": "", - "value": 11.69730809524162 + "value": 6.888008002008792 }, { "name": "dram__sectors_read.sum", "type": "Double", "unit": "", - "value": 2103780.0 + "value": 17036.0 }, { "name": "dram__sectors_write.sum", "type": "Double", "unit": "", - "value": 1940816.0 + "value": 1843912.0 }, { "name": "lts__t_sectors.avg.pct_of_peak_sustained_elapsed", "type": "Double", "unit": "", - "value": 48.202072912304594 + "value": 0.9238768326387793 }, { "name": "lts__t_sectors_op_read.sum", "type": "Double", "unit": "", - "value": 2944373.0 + "value": 133095.0 }, { "name": "lts__t_sectors_op_write.sum", "type": "Double", "unit": "", - "value": 138460017.0 + "value": 2111631.0 }, { "name": "l1tex__t_requests_pipe_lsu_mem_global_op_ld.avg.pct_of_peak_sustained_active", "type": "Double", "unit": "", - "value": 5.20234285576296 + "value": 47.98659024177635 }, { "name": "l1tex__t_requests_pipe_lsu_mem_global_op_ld.sum", "type": "Double", "unit": "", - "value": 17301504.0 + "value": 134217728.0 }, { "name": "l1tex__data_pipe_lsu_wavefronts_mem_shared.avg.pct_of_peak_sustained_elapsed", "type": "Double", "unit": "", - "value": 0.058677990321808524 + "value": 0.5621688081920045 }, { "name": "l1tex__data_pipe_lsu_wavefronts_mem_shared_op_ld.sum", @@ -33448,25 +36424,25 @@ "name": "l1tex__t_sectors_pipe_lsu_mem_local_op_ld.sum", "type": "Double", "unit": "", - "value": 136314880.0 + "value": 0.0 }, { "name": "l1tex__t_sectors_pipe_lsu_mem_local_op_st.sum", "type": "Double", "unit": "", - "value": 136314880.0 + "value": 0.0 }, { "name": "sm__warps_active.avg.pct_of_peak_sustained_active", "type": "Double", "unit": "", - "value": 93.33979928964085 + "value": 65.22088961988341 }, { "name": "smsp__cycles_active.avg.pct_of_peak_sustained_elapsed", "type": "Double", "unit": "", - "value": 99.42438211546141 + "value": 99.94550797934127 }, { "name": "smsp__sass_thread_inst_executed_op_fp16_pred_on.sum", @@ -33478,7 +36454,7 @@ "name": "smsp__sass_thread_inst_executed_op_fp32_pred_on.sum", "type": "Double", "unit": "", - "value": 4982833152.0 + "value": 8589934592.0 }, { "name": "smsp__sass_thread_inst_executed_op_fp64_pred_on.sum", @@ -33490,43 +36466,43 @@ "name": "smsp__sass_thread_inst_executed_op_integer_pred_on.sum", "type": "Double", "unit": "", - "value": 4112515072.0 + "value": 2952790016.0 }, { "name": "smsp__sass_thread_inst_executed_op_control_pred_on.sum", "type": "Double", "unit": "", - "value": 1073741824.0 + "value": 285212672.0 }, { "name": "smsp__sass_thread_inst_executed_op_memory_pred_on.sum", "type": "Double", "unit": "", - "value": 2738880512.0 + "value": 4311744512.0 }, { "name": "smsp__sass_thread_inst_executed_op_misc_pred_on.sum", "type": "Double", "unit": "", - "value": 427819008.0 + "value": 318767104.0 }, { "name": "smsp__sass_thread_inst_executed_op_conversion_pred_on.sum", "type": "Double", "unit": "", - "value": 6291456.0 + "value": 50331648.0 }, { "name": "smsp__inst_executed.sum", "type": "Double", "unit": "", - "value": 421462016.0 + "value": 527958016.0 }, { "name": "smsp__pipe_fma_cycles_active.avg.pct_of_peak_sustained_active", "type": "Double", "unit": "", - "value": 11.813513024892146 + "value": 21.374225298268623 }, { "name": "smsp__inst_executed_pipe_fp64.avg.pct_of_peak_sustained_active", @@ -33538,13 +36514,13 @@ "name": "smsp__inst_executed_pipe_xu.avg.pct_of_peak_sustained_active", "type": "Double", "unit": "", - "value": 20.14471058676996 + "value": 23.99894666049764 }, { "name": "smsp__inst_executed_pipe_lsu.avg.pct_of_peak_sustained_active", "type": "Double", "unit": "", - "value": 12.875696366250919 + "value": 24.373930202067918 }, { "name": "smsp__inst_executed_pipe_tex.avg.pct_of_peak_sustained_active", @@ -33556,7 +36532,7 @@ "name": "smsp__issue_active.avg.pct_of_peak_sustained_active", "type": "Double", "unit": "", - "value": 31.628890140959598 + "value": 47.2014413043658 }, { "name": "smsp__thread_inst_executed_per_inst_executed.ratio", @@ -33575,30 +36551,47 @@ "time" ], "times": { - "compilation": 16368.786, - "data": 61692.419, - "framework": 934404.505, - "kernel_overhead": 382191.625, - "profiling_overhead": 51541.792, - "profiling_runs": 438978.669, + "compilation_time": 14686.102, + "data": 59040.489, + "framework": 504182.887, + "kernel_overhead": 174269.622, + "profiling_overhead": 48829.997, + "profiling_runs": 222042.779, "runtimes": [ - 8484.672 + 7055.296 ], - "search_algorithm": 52.091, - "validation": 16.318 + "search_algorithm": 22.682, + "validation": 15.559 }, - "timestamp": "2026-01-27 09:26:31 UTC" + "timestamp": "2026-03-13 09:41:19 UTC" }, { + "compilation_data": { + "constant_memory_size": 0, + "global_size": { + "x": 8, + "y": 128, + "z": 128 + }, + "local_memory_size": 0, + "local_size": { + "x": 32, + "y": 2, + "z": 1 + }, + "max_work_group_size": 1024, + "private_memory_size": 0, + "registers": 39 + }, "configuration": { - "INNER_UNROLL_FACTOR": "1", - "USE_CONSTANT_MEMORY": "0", - "USE_SOA": "1", - "VECTOR_SIZE": "1", - "WORK_GROUP_SIZE_X": "16", - "WORK_GROUP_SIZE_Y": "8", - "WORK_GROUP_SIZE_Z": "1", - "Z_ITERATIONS": "8" + "INNER_UNROLL_FACTOR": 0, + "USE_CONSTANT_MEMORY": 0, + "USE_SOA": 1, + "VECTOR_SIZE": 1, + "WORK_GROUP_SIZE_X": 32, + "WORK_GROUP_SIZE_Y": 2, + "WORK_GROUP_SIZE_Z": 1, + "Z_ITERATIONS": 2 }, "correctness": 1, "invalidity": "correct", @@ -33606,61 +36599,61 @@ { "name": "time", "unit": "", - "value": 8325.28 + "value": 4046.208 }, { "name": "dram__throughput.avg.pct_of_peak_sustained_elapsed", "type": "Double", "unit": "", - "value": 11.653682004726242 + "value": 13.075101565192156 }, { "name": "dram__sectors_read.sum", "type": "Double", "unit": "", - "value": 2098748.0 + "value": 6940.0 }, { "name": "dram__sectors_write.sum", "type": "Double", "unit": "", - "value": 1928496.0 + "value": 1838620.0 }, { "name": "lts__t_sectors.avg.pct_of_peak_sustained_elapsed", "type": "Double", "unit": "", - "value": 48.52245116496773 + "value": 1.70131183578719 }, { "name": "lts__t_sectors_op_read.sum", "type": "Double", "unit": "", - "value": 3069218.0 + "value": 68565.0 }, { "name": "lts__t_sectors_op_write.sum", "type": "Double", "unit": "", - "value": 138414916.0 + "value": 2100734.0 }, { "name": "l1tex__t_requests_pipe_lsu_mem_global_op_ld.avg.pct_of_peak_sustained_active", "type": "Double", "unit": "", - "value": 5.228566782588522 + "value": 45.69506274431203 }, { "name": "l1tex__t_requests_pipe_lsu_mem_global_op_ld.sum", "type": "Double", "unit": "", - "value": 17301504.0 + "value": 67108864.0 }, { "name": "l1tex__data_pipe_lsu_wavefronts_mem_shared.avg.pct_of_peak_sustained_elapsed", "type": "Double", "unit": "", - "value": 0.05921142125095403 + "value": 0.5352690954516394 }, { "name": "l1tex__data_pipe_lsu_wavefronts_mem_shared_op_ld.sum", @@ -33678,25 +36671,25 @@ "name": "l1tex__t_sectors_pipe_lsu_mem_local_op_ld.sum", "type": "Double", "unit": "", - "value": 136314880.0 + "value": 0.0 }, { "name": "l1tex__t_sectors_pipe_lsu_mem_local_op_st.sum", "type": "Double", "unit": "", - "value": 136314880.0 + "value": 0.0 }, { "name": "sm__warps_active.avg.pct_of_peak_sustained_active", "type": "Double", "unit": "", - "value": 97.64108358250778 + "value": 75.37722329320206 }, { "name": "smsp__cycles_active.avg.pct_of_peak_sustained_elapsed", "type": "Double", "unit": "", - "value": 100.34417752912108 + "value": 99.90595873466445 }, { "name": "smsp__sass_thread_inst_executed_op_fp16_pred_on.sum", @@ -33708,7 +36701,7 @@ "name": "smsp__sass_thread_inst_executed_op_fp32_pred_on.sum", "type": "Double", "unit": "", - "value": 4982833152.0 + "value": 6442450944.0 }, { "name": "smsp__sass_thread_inst_executed_op_fp64_pred_on.sum", @@ -33720,43 +36713,43 @@ "name": "smsp__sass_thread_inst_executed_op_integer_pred_on.sum", "type": "Double", "unit": "", - "value": 4112515072.0 + "value": 1509949440.0 }, { "name": "smsp__sass_thread_inst_executed_op_control_pred_on.sum", "type": "Double", "unit": "", - "value": 1073741824.0 + "value": 142606336.0 }, { "name": "smsp__sass_thread_inst_executed_op_memory_pred_on.sum", "type": "Double", "unit": "", - "value": 2738880512.0 + "value": 2164260864.0 }, { "name": "smsp__sass_thread_inst_executed_op_misc_pred_on.sum", "type": "Double", "unit": "", - "value": 427819008.0 + "value": 184549376.0 }, { "name": "smsp__sass_thread_inst_executed_op_conversion_pred_on.sum", "type": "Double", "unit": "", - "value": 6291456.0 + "value": 25165824.0 }, { "name": "smsp__inst_executed.sum", "type": "Double", "unit": "", - "value": 421462016.0 + "value": 333971456.0 }, { "name": "smsp__pipe_fma_cycles_active.avg.pct_of_peak_sustained_active", "type": "Double", "unit": "", - "value": 11.811480932041173 + "value": 28.977331254604366 }, { "name": "smsp__inst_executed_pipe_fp64.avg.pct_of_peak_sustained_active", @@ -33768,13 +36761,13 @@ "name": "smsp__inst_executed_pipe_xu.avg.pct_of_peak_sustained_active", "type": "Double", "unit": "", - "value": 20.141509238168688 + "value": 45.71929114510185 }, { "name": "smsp__inst_executed_pipe_lsu.avg.pct_of_peak_sustained_active", "type": "Double", "unit": "", - "value": 12.873650191778715 + "value": 23.306123025139808 }, { "name": "smsp__inst_executed_pipe_tex.avg.pct_of_peak_sustained_active", @@ -33786,7 +36779,7 @@ "name": "smsp__issue_active.avg.pct_of_peak_sustained_active", "type": "Double", "unit": "", - "value": 31.62388491366387 + "value": 56.88180160056062 }, { "name": "smsp__thread_inst_executed_per_inst_executed.ratio", @@ -33805,30 +36798,47 @@ "time" ], "times": { - "compilation": 16235.35, - "data": 63129.813, - "framework": 942191.1910000001, - "kernel_overhead": 385124.364, - "profiling_overhead": 52901.143, - "profiling_runs": 441035.871, + "compilation_time": 14546.306, + "data": 57598.973, + "framework": 318209.226, + "kernel_overhead": 91205.528, + "profiling_overhead": 47398.963, + "profiling_runs": 122005.762, "runtimes": [ - 8325.28 + 4046.208 ], - "search_algorithm": 25.454, - "validation": 16.271 + "search_algorithm": 27.392, + "validation": 18.064 }, - "timestamp": "2026-01-27 09:26:32 UTC" + "timestamp": "2026-03-13 09:41:19 UTC" }, { + "compilation_data": { + "constant_memory_size": 0, + "global_size": { + "x": 8, + "y": 128, + "z": 64 + }, + "local_memory_size": 0, + "local_size": { + "x": 32, + "y": 2, + "z": 1 + }, + "max_work_group_size": 1024, + "private_memory_size": 0, + "registers": 40 + }, "configuration": { - "INNER_UNROLL_FACTOR": "1", - "USE_CONSTANT_MEMORY": "0", - "USE_SOA": "1", - "VECTOR_SIZE": "1", - "WORK_GROUP_SIZE_X": "32", - "WORK_GROUP_SIZE_Y": "2", - "WORK_GROUP_SIZE_Z": "1", - "Z_ITERATIONS": "8" + "INNER_UNROLL_FACTOR": 0, + "USE_CONSTANT_MEMORY": 0, + "USE_SOA": 1, + "VECTOR_SIZE": 1, + "WORK_GROUP_SIZE_X": 32, + "WORK_GROUP_SIZE_Y": 2, + "WORK_GROUP_SIZE_Z": 1, + "Z_ITERATIONS": 4 }, "correctness": 1, "invalidity": "correct", @@ -33836,61 +36846,61 @@ { "name": "time", "unit": "", - "value": 8686.112 + "value": 2245.568 }, { "name": "dram__throughput.avg.pct_of_peak_sustained_elapsed", "type": "Double", "unit": "", - "value": 12.409733723727276 + "value": 21.720855412566237 }, { "name": "dram__sectors_read.sum", "type": "Double", "unit": "", - "value": 2114208.0 + "value": 504.0 }, { "name": "dram__sectors_write.sum", "type": "Double", "unit": "", - "value": 1913932.0 + "value": 1835864.0 }, { "name": "lts__t_sectors.avg.pct_of_peak_sustained_elapsed", "type": "Double", "unit": "", - "value": 49.00090736700588 + "value": 2.7932784664881956 }, { "name": "lts__t_sectors_op_read.sum", "type": "Double", "unit": "", - "value": 2646364.0 + "value": 37807.0 }, { "name": "lts__t_sectors_op_write.sum", "type": "Double", "unit": "", - "value": 138420833.0 + "value": 2099487.0 }, { "name": "l1tex__t_requests_pipe_lsu_mem_global_op_ld.avg.pct_of_peak_sustained_active", "type": "Double", "unit": "", - "value": 5.170868742598362 + "value": 37.92586076828753 }, { "name": "l1tex__t_requests_pipe_lsu_mem_global_op_ld.sum", "type": "Double", "unit": "", - "value": 17301504.0 + "value": 33554432.0 }, { "name": "l1tex__data_pipe_lsu_wavefronts_mem_shared.avg.pct_of_peak_sustained_elapsed", "type": "Double", "unit": "", - "value": 0.05888695602168006 + "value": 0.4442042515349954 }, { "name": "l1tex__data_pipe_lsu_wavefronts_mem_shared_op_ld.sum", @@ -33908,25 +36918,25 @@ "name": "l1tex__t_sectors_pipe_lsu_mem_local_op_ld.sum", "type": "Double", "unit": "", - "value": 136314880.0 + "value": 0.0 }, { "name": "l1tex__t_sectors_pipe_lsu_mem_local_op_st.sum", "type": "Double", "unit": "", - "value": 136314880.0 + "value": 0.0 }, { "name": "sm__warps_active.avg.pct_of_peak_sustained_active", "type": "Double", "unit": "", - "value": 91.95780347905517 + "value": 96.14097415204756 }, { "name": "smsp__cycles_active.avg.pct_of_peak_sustained_elapsed", "type": "Double", "unit": "", - "value": 99.00866937367914 + "value": 99.92453890266584 }, { "name": "smsp__sass_thread_inst_executed_op_fp16_pred_on.sum", @@ -33938,7 +36948,7 @@ "name": "smsp__sass_thread_inst_executed_op_fp32_pred_on.sum", "type": "Double", "unit": "", - "value": 4982833152.0 + "value": 5368709120.0 }, { "name": "smsp__sass_thread_inst_executed_op_fp64_pred_on.sum", @@ -33950,43 +36960,43 @@ "name": "smsp__sass_thread_inst_executed_op_integer_pred_on.sum", "type": "Double", "unit": "", - "value": 4112515072.0 + "value": 591396864.0 }, { "name": "smsp__sass_thread_inst_executed_op_control_pred_on.sum", "type": "Double", "unit": "", - "value": 1073741824.0 + "value": 71303168.0 }, { "name": "smsp__sass_thread_inst_executed_op_memory_pred_on.sum", "type": "Double", "unit": "", - "value": 2738880512.0 + "value": 1090519040.0 }, { "name": "smsp__sass_thread_inst_executed_op_misc_pred_on.sum", "type": "Double", "unit": "", - "value": 427819008.0 + "value": 146800640.0 }, { "name": "smsp__sass_thread_inst_executed_op_conversion_pred_on.sum", "type": "Double", "unit": "", - "value": 6291456.0 + "value": 12582912.0 }, { "name": "smsp__inst_executed.sum", "type": "Double", "unit": "", - "value": 421462016.0 + "value": 228327424.0 }, { "name": "smsp__pipe_fma_cycles_active.avg.pct_of_peak_sustained_active", "type": "Double", "unit": "", - "value": 11.905897199945667 + "value": 43.23092087172374 }, { "name": "smsp__inst_executed_pipe_fp64.avg.pct_of_peak_sustained_active", @@ -33998,13 +37008,13 @@ "name": "smsp__inst_executed_pipe_xu.avg.pct_of_peak_sustained_active", "type": "Double", "unit": "", - "value": 20.30133433353357 + "value": 75.86810983684879 }, { "name": "smsp__inst_executed_pipe_lsu.avg.pct_of_peak_sustained_active", "type": "Double", "unit": "", - "value": 12.975804024704807 + "value": 19.48565711630003 }, { "name": "smsp__inst_executed_pipe_tex.avg.pct_of_peak_sustained_active", @@ -34016,7 +37026,7 @@ "name": "smsp__issue_active.avg.pct_of_peak_sustained_active", "type": "Double", "unit": "", - "value": 31.874804867004364 + "value": 64.533377238322 }, { "name": "smsp__thread_inst_executed_per_inst_executed.ratio", @@ -34035,30 +37045,47 @@ "time" ], "times": { - "compilation": 15963.301, - "data": 63691.752, - "framework": 939418.405, - "kernel_overhead": 383513.6, - "profiling_overhead": 53455.57, - "profiling_runs": 438757.483, + "compilation_time": 15051.729, + "data": 57536.988, + "framework": 191903.772, + "kernel_overhead": 32064.379, + "profiling_overhead": 47284.411, + "profiling_runs": 55017.994, "runtimes": [ - 8686.112 + 2245.568 ], - "search_algorithm": 24.297, - "validation": 18.561 + "search_algorithm": 35.413, + "validation": 15.323 }, - "timestamp": "2026-01-27 09:26:32 UTC" + "timestamp": "2026-03-13 09:41:20 UTC" }, { + "compilation_data": { + "constant_memory_size": 0, + "global_size": { + "x": 8, + "y": 128, + "z": 32 + }, + "local_memory_size": 0, + "local_size": { + "x": 32, + "y": 2, + "z": 1 + }, + "max_work_group_size": 1024, + "private_memory_size": 0, + "registers": 40 + }, "configuration": { - "INNER_UNROLL_FACTOR": "1", - "USE_CONSTANT_MEMORY": "0", - "USE_SOA": "1", - "VECTOR_SIZE": "1", - "WORK_GROUP_SIZE_X": "32", - "WORK_GROUP_SIZE_Y": "4", - "WORK_GROUP_SIZE_Z": "1", - "Z_ITERATIONS": "8" + "INNER_UNROLL_FACTOR": 0, + "USE_CONSTANT_MEMORY": 0, + "USE_SOA": 1, + "VECTOR_SIZE": 1, + "WORK_GROUP_SIZE_X": 32, + "WORK_GROUP_SIZE_Y": 2, + "WORK_GROUP_SIZE_Z": 1, + "Z_ITERATIONS": 8 }, "correctness": 1, "invalidity": "correct", @@ -34066,61 +37093,61 @@ { "name": "time", "unit": "", - "value": 8315.903 + "value": 2088.448 }, { "name": "dram__throughput.avg.pct_of_peak_sustained_elapsed", "type": "Double", "unit": "", - "value": 12.804588962128866 + "value": 22.911341409202045 }, { "name": "dram__sectors_read.sum", "type": "Double", "unit": "", - "value": 2113916.0 + "value": 6112.0 }, { "name": "dram__sectors_write.sum", "type": "Double", "unit": "", - "value": 1909208.0 + "value": 1841048.0 }, { "name": "lts__t_sectors.avg.pct_of_peak_sustained_elapsed", "type": "Double", "unit": "", - "value": 49.30394277550515 + "value": 3.1503082026330875 }, { "name": "lts__t_sectors_op_read.sum", "type": "Double", "unit": "", - "value": 2788749.0 + "value": 41581.0 }, { "name": "lts__t_sectors_op_write.sum", "type": "Double", "unit": "", - "value": 138417400.0 + "value": 2101148.0 }, { "name": "l1tex__t_requests_pipe_lsu_mem_global_op_ld.avg.pct_of_peak_sustained_active", "type": "Double", "unit": "", - "value": 5.237576208131443 + "value": 21.532962939138056 }, { "name": "l1tex__t_requests_pipe_lsu_mem_global_op_ld.sum", "type": "Double", "unit": "", - "value": 17301504.0 + "value": 16777216.0 }, { "name": "l1tex__data_pipe_lsu_wavefronts_mem_shared.avg.pct_of_peak_sustained_elapsed", "type": "Double", "unit": "", - "value": 0.059412943260542486 + "value": 0.252187037077323 }, { "name": "l1tex__data_pipe_lsu_wavefronts_mem_shared_op_ld.sum", @@ -34138,25 +37165,25 @@ "name": "l1tex__t_sectors_pipe_lsu_mem_local_op_ld.sum", "type": "Double", "unit": "", - "value": 136314880.0 + "value": 0.0 }, { "name": "l1tex__t_sectors_pipe_lsu_mem_local_op_st.sum", "type": "Double", "unit": "", - "value": 136314880.0 + "value": 0.0 }, { "name": "sm__warps_active.avg.pct_of_peak_sustained_active", "type": "Double", "unit": "", - "value": 97.91559561741819 + "value": 97.84719307509752 }, { "name": "smsp__cycles_active.avg.pct_of_peak_sustained_elapsed", "type": "Double", "unit": "", - "value": 99.7187760759627 + "value": 99.920498257184 }, { "name": "smsp__sass_thread_inst_executed_op_fp16_pred_on.sum", @@ -34168,7 +37195,7 @@ "name": "smsp__sass_thread_inst_executed_op_fp32_pred_on.sum", "type": "Double", "unit": "", - "value": 4982833152.0 + "value": 4831838208.0 }, { "name": "smsp__sass_thread_inst_executed_op_fp64_pred_on.sum", @@ -34180,25 +37207,25 @@ "name": "smsp__sass_thread_inst_executed_op_integer_pred_on.sum", "type": "Double", "unit": "", - "value": 4112515072.0 + "value": 564133888.0 }, { "name": "smsp__sass_thread_inst_executed_op_control_pred_on.sum", "type": "Double", "unit": "", - "value": 1073741824.0 + "value": 69206016.0 }, { "name": "smsp__sass_thread_inst_executed_op_memory_pred_on.sum", "type": "Double", "unit": "", - "value": 2738880512.0 + "value": 553648128.0 }, { "name": "smsp__sass_thread_inst_executed_op_misc_pred_on.sum", "type": "Double", "unit": "", - "value": 427819008.0 + "value": 115343360.0 }, { "name": "smsp__sass_thread_inst_executed_op_conversion_pred_on.sum", @@ -34210,13 +37237,13 @@ "name": "smsp__inst_executed.sum", "type": "Double", "unit": "", - "value": 421462016.0 + "value": 192282624.0 }, { "name": "smsp__pipe_fma_cycles_active.avg.pct_of_peak_sustained_active", "type": "Double", "unit": "", - "value": 11.92524447586708 + "value": 43.62083560969914 }, { "name": "smsp__inst_executed_pipe_fp64.avg.pct_of_peak_sustained_active", @@ -34228,13 +37255,13 @@ "name": "smsp__inst_executed_pipe_xu.avg.pct_of_peak_sustained_active", "type": "Double", "unit": "", - "value": 20.336809941205836 + "value": 86.14833141394058 }, { "name": "smsp__inst_executed_pipe_lsu.avg.pct_of_peak_sustained_active", "type": "Double", "unit": "", - "value": 12.998478619647674 + "value": 11.231252191172917 }, { "name": "smsp__inst_executed_pipe_tex.avg.pct_of_peak_sustained_active", @@ -34246,7 +37273,7 @@ "name": "smsp__issue_active.avg.pct_of_peak_sustained_active", "type": "Double", "unit": "", - "value": 31.930523499258932 + "value": 61.709917226560606 }, { "name": "smsp__thread_inst_executed_per_inst_executed.ratio", @@ -34265,260 +37292,47 @@ "time" ], "times": { - "compilation": 15345.311, - "data": 63234.45, - "framework": 939786.992, - "kernel_overhead": 384740.823, - "profiling_overhead": 53002.033, - "profiling_runs": 438809.686, + "compilation_time": 15032.414, + "data": 57421.193, + "framework": 186134.418, + "kernel_overhead": 29616.632, + "profiling_overhead": 47323.853, + "profiling_runs": 51772.74, "runtimes": [ - 8315.903 + 2088.448 ], - "search_algorithm": 25.84, - "validation": 16.959 + "search_algorithm": 25.508, + "validation": 16.966 }, - "timestamp": "2026-01-27 09:26:33 UTC" + "timestamp": "2026-03-13 09:41:20 UTC" }, { - "configuration": { - "INNER_UNROLL_FACTOR": "1", - "USE_CONSTANT_MEMORY": "0", - "USE_SOA": "1", - "VECTOR_SIZE": "1", - "WORK_GROUP_SIZE_X": "32", - "WORK_GROUP_SIZE_Y": "8", - "WORK_GROUP_SIZE_Z": "1", - "Z_ITERATIONS": "8" + "compilation_data": { + "constant_memory_size": 0, + "global_size": { + "x": 8, + "y": 128, + "z": 16 + }, + "local_memory_size": 0, + "local_size": { + "x": 32, + "y": 2, + "z": 1 + }, + "max_work_group_size": 1024, + "private_memory_size": 0, + "registers": 40 }, - "correctness": 1, - "invalidity": "correct", - "measurements": [ - { - "name": "time", - "unit": "", - "value": 8305.76 - }, - { - "name": "dram__throughput.avg.pct_of_peak_sustained_elapsed", - "type": "Double", - "unit": "", - "value": 12.599896681002095 - }, - { - "name": "dram__sectors_read.sum", - "type": "Double", - "unit": "", - "value": 2120372.0 - }, - { - "name": "dram__sectors_write.sum", - "type": "Double", - "unit": "", - "value": 1914756.0 - }, - { - "name": "lts__t_sectors.avg.pct_of_peak_sustained_elapsed", - "type": "Double", - "unit": "", - "value": 49.449618003971274 - }, - { - "name": "lts__t_sectors_op_read.sum", - "type": "Double", - "unit": "", - "value": 2748911.0 - }, - { - "name": "lts__t_sectors_op_write.sum", - "type": "Double", - "unit": "", - "value": 138428102.0 - }, - { - "name": "l1tex__t_requests_pipe_lsu_mem_global_op_ld.avg.pct_of_peak_sustained_active", - "type": "Double", - "unit": "", - "value": 5.305681437919774 - }, - { - "name": "l1tex__t_requests_pipe_lsu_mem_global_op_ld.sum", - "type": "Double", - "unit": "", - "value": 17301504.0 - }, - { - "name": "l1tex__data_pipe_lsu_wavefronts_mem_shared.avg.pct_of_peak_sustained_elapsed", - "type": "Double", - "unit": "", - "value": 0.05915589904618334 - }, - { - "name": "l1tex__data_pipe_lsu_wavefronts_mem_shared_op_ld.sum", - "type": "Double", - "unit": "", - "value": 0.0 - }, - { - "name": "l1tex__data_pipe_lsu_wavefronts_mem_shared_op_st.sum", - "type": "Double", - "unit": "", - "value": 0.0 - }, - { - "name": "l1tex__t_sectors_pipe_lsu_mem_local_op_ld.sum", - "type": "Double", - "unit": "", - "value": 136314880.0 - }, - { - "name": "l1tex__t_sectors_pipe_lsu_mem_local_op_st.sum", - "type": "Double", - "unit": "", - "value": 136314880.0 - }, - { - "name": "sm__warps_active.avg.pct_of_peak_sustained_active", - "type": "Double", - "unit": "", - "value": 96.5804514676885 - }, - { - "name": "smsp__cycles_active.avg.pct_of_peak_sustained_elapsed", - "type": "Double", - "unit": "", - "value": 97.72456359526332 - }, - { - "name": "smsp__sass_thread_inst_executed_op_fp16_pred_on.sum", - "type": "Double", - "unit": "", - "value": 0.0 - }, - { - "name": "smsp__sass_thread_inst_executed_op_fp32_pred_on.sum", - "type": "Double", - "unit": "", - "value": 4982833152.0 - }, - { - "name": "smsp__sass_thread_inst_executed_op_fp64_pred_on.sum", - "type": "Double", - "unit": "", - "value": 0.0 - }, - { - "name": "smsp__sass_thread_inst_executed_op_integer_pred_on.sum", - "type": "Double", - "unit": "", - "value": 4112515072.0 - }, - { - "name": "smsp__sass_thread_inst_executed_op_control_pred_on.sum", - "type": "Double", - "unit": "", - "value": 1073741824.0 - }, - { - "name": "smsp__sass_thread_inst_executed_op_memory_pred_on.sum", - "type": "Double", - "unit": "", - "value": 2738880512.0 - }, - { - "name": "smsp__sass_thread_inst_executed_op_misc_pred_on.sum", - "type": "Double", - "unit": "", - "value": 427819008.0 - }, - { - "name": "smsp__sass_thread_inst_executed_op_conversion_pred_on.sum", - "type": "Double", - "unit": "", - "value": 6291456.0 - }, - { - "name": "smsp__inst_executed.sum", - "type": "Double", - "unit": "", - "value": 421462016.0 - }, - { - "name": "smsp__pipe_fma_cycles_active.avg.pct_of_peak_sustained_active", - "type": "Double", - "unit": "", - "value": 12.11718599234574 - }, - { - "name": "smsp__inst_executed_pipe_fp64.avg.pct_of_peak_sustained_active", - "type": "Double", - "unit": "", - "value": 0.0 - }, - { - "name": "smsp__inst_executed_pipe_xu.avg.pct_of_peak_sustained_active", - "type": "Double", - "unit": "", - "value": 20.662031596672804 - }, - { - "name": "smsp__inst_executed_pipe_lsu.avg.pct_of_peak_sustained_active", - "type": "Double", - "unit": "", - "value": 13.206347343771826 - }, - { - "name": "smsp__inst_executed_pipe_tex.avg.pct_of_peak_sustained_active", - "type": "Double", - "unit": "", - "value": 0.0 - }, - { - "name": "smsp__issue_active.avg.pct_of_peak_sustained_active", - "type": "Double", - "unit": "", - "value": 32.44115181140972 - }, - { - "name": "smsp__thread_inst_executed_per_inst_executed.ratio", - "type": "Double", - "unit": "", - "value": 32.0 - }, - { - "name": "smsp__thread_inst_executed_per_inst_executed.pct", - "type": "Double", - "unit": "", - "value": 100.0 - } - ], - "objectives": [ - "time" - ], - "times": { - "compilation": 16241.515, - "data": 66099.467, - "framework": 940942.9709999999, - "kernel_overhead": 382308.524, - "profiling_overhead": 54672.574, - "profiling_runs": 437862.406, - "runtimes": [ - 8305.76 - ], - "search_algorithm": 27.04, - "validation": 16.024 - }, - "timestamp": "2026-01-27 09:26:33 UTC" - }, - { "configuration": { - "INNER_UNROLL_FACTOR": "1", - "USE_CONSTANT_MEMORY": "0", - "USE_SOA": "1", - "VECTOR_SIZE": "1", - "WORK_GROUP_SIZE_X": "16", - "WORK_GROUP_SIZE_Y": "4", - "WORK_GROUP_SIZE_Z": "1", - "Z_ITERATIONS": "16" + "INNER_UNROLL_FACTOR": 0, + "USE_CONSTANT_MEMORY": 0, + "USE_SOA": 1, + "VECTOR_SIZE": 1, + "WORK_GROUP_SIZE_X": 32, + "WORK_GROUP_SIZE_Y": 2, + "WORK_GROUP_SIZE_Z": 1, + "Z_ITERATIONS": 16 }, "correctness": 1, "invalidity": "correct", @@ -34526,61 +37340,61 @@ { "name": "time", "unit": "", - "value": 8382.176 + "value": 1845.792 }, { "name": "dram__throughput.avg.pct_of_peak_sustained_elapsed", "type": "Double", "unit": "", - "value": 11.958069590847593 + "value": 26.025491018288065 }, { "name": "dram__sectors_read.sum", "type": "Double", "unit": "", - "value": 2098040.0 + "value": 7108.0 }, { "name": "dram__sectors_write.sum", "type": "Double", "unit": "", - "value": 2020036.0 + "value": 1839944.0 }, { "name": "lts__t_sectors.avg.pct_of_peak_sustained_elapsed", "type": "Double", "unit": "", - "value": 54.35421988512385 + "value": 3.3950844452034126 }, { "name": "lts__t_sectors_op_read.sum", "type": "Double", "unit": "", - "value": 19521636.0 + "value": 39083.0 }, { "name": "lts__t_sectors_op_write.sum", "type": "Double", "unit": "", - "value": 138414788.0 + "value": 2104419.0 }, { "name": "l1tex__t_requests_pipe_lsu_mem_global_op_ld.avg.pct_of_peak_sustained_active", "type": "Double", "unit": "", - "value": 2.6972015760762176 + "value": 11.450248651536791 }, { "name": "l1tex__t_requests_pipe_lsu_mem_global_op_ld.sum", "type": "Double", "unit": "", - "value": 8912896.0 + "value": 8388608.0 }, { "name": "l1tex__data_pipe_lsu_wavefronts_mem_shared.avg.pct_of_peak_sustained_elapsed", "type": "Double", "unit": "", - "value": 0.02970899976702806 + "value": 0.13409483368224762 }, { "name": "l1tex__data_pipe_lsu_wavefronts_mem_shared_op_ld.sum", @@ -34598,25 +37412,25 @@ "name": "l1tex__t_sectors_pipe_lsu_mem_local_op_ld.sum", "type": "Double", "unit": "", - "value": 136314880.0 + "value": 0.0 }, { "name": "l1tex__t_sectors_pipe_lsu_mem_local_op_st.sum", "type": "Double", "unit": "", - "value": 136314880.0 + "value": 0.0 }, { "name": "sm__warps_active.avg.pct_of_peak_sustained_active", "type": "Double", "unit": "", - "value": 88.72108916250622 + "value": 98.27875871475187 }, { "name": "smsp__cycles_active.avg.pct_of_peak_sustained_elapsed", "type": "Double", "unit": "", - "value": 99.85121851747836 + "value": 99.91699690211252 }, { "name": "smsp__sass_thread_inst_executed_op_fp16_pred_on.sum", @@ -34628,7 +37442,7 @@ "name": "smsp__sass_thread_inst_executed_op_fp32_pred_on.sum", "type": "Double", "unit": "", - "value": 4647288832.0 + "value": 4563402752.0 }, { "name": "smsp__sass_thread_inst_executed_op_fp64_pred_on.sum", @@ -34640,25 +37454,25 @@ "name": "smsp__sass_thread_inst_executed_op_integer_pred_on.sum", "type": "Double", "unit": "", - "value": 3700424704.0 + "value": 209715200.0 }, { "name": "smsp__sass_thread_inst_executed_op_control_pred_on.sum", "type": "Double", "unit": "", - "value": 1073741824.0 + "value": 67108864.0 }, { "name": "smsp__sass_thread_inst_executed_op_memory_pred_on.sum", "type": "Double", "unit": "", - "value": 2470445056.0 + "value": 285212672.0 }, { "name": "smsp__sass_thread_inst_executed_op_misc_pred_on.sum", "type": "Double", "unit": "", - "value": 213909504.0 + "value": 558891008.0 }, { "name": "smsp__sass_thread_inst_executed_op_conversion_pred_on.sum", @@ -34670,13 +37484,13 @@ "name": "smsp__inst_executed.sum", "type": "Double", "unit": "", - "value": 380665856.0 + "value": 194805760.0 }, { "name": "smsp__pipe_fma_cycles_active.avg.pct_of_peak_sustained_active", "type": "Double", "unit": "", - "value": 9.843016300771886 + "value": 37.77836471462889 }, { "name": "smsp__inst_executed_pipe_fp64.avg.pct_of_peak_sustained_active", @@ -34688,13 +37502,13 @@ "name": "smsp__inst_executed_pipe_xu.avg.pct_of_peak_sustained_active", "type": "Double", "unit": "", - "value": 20.311563686533972 + "value": 91.61811900408068 }, { "name": "smsp__inst_executed_pipe_lsu.avg.pct_of_peak_sustained_active", "type": "Double", "unit": "", - "value": 11.697992855599034 + "value": 6.151118829619675 }, { "name": "smsp__inst_executed_pipe_tex.avg.pct_of_peak_sustained_active", @@ -34706,7 +37520,7 @@ "name": "smsp__issue_active.avg.pct_of_peak_sustained_active", "type": "Double", "unit": "", - "value": 28.80396434923364 + "value": 66.48973405762794 }, { "name": "smsp__thread_inst_executed_per_inst_executed.ratio", @@ -34725,30 +37539,47 @@ "time" ], "times": { - "compilation": 17332.943, - "data": 62854.273, - "framework": 901366.937, - "kernel_overhead": 365122.954, - "profiling_overhead": 52436.008, - "profiling_runs": 420953.702, + "compilation_time": 15304.788, + "data": 56966.953, + "framework": 192340.883, + "kernel_overhead": 33616.358, + "profiling_overhead": 47170.566, + "profiling_runs": 54587.006, "runtimes": [ - 8382.176 + 1845.792 ], - "search_algorithm": 27.977, - "validation": 16.262 + "search_algorithm": 30.021, + "validation": 14.905 }, - "timestamp": "2026-01-27 09:26:34 UTC" + "timestamp": "2026-03-13 09:41:20 UTC" }, { + "compilation_data": { + "constant_memory_size": 0, + "global_size": { + "x": 8, + "y": 128, + "z": 8 + }, + "local_memory_size": 0, + "local_size": { + "x": 32, + "y": 2, + "z": 1 + }, + "max_work_group_size": 1024, + "private_memory_size": 0, + "registers": 48 + }, "configuration": { - "INNER_UNROLL_FACTOR": "1", - "USE_CONSTANT_MEMORY": "0", - "USE_SOA": "1", - "VECTOR_SIZE": "1", - "WORK_GROUP_SIZE_X": "16", - "WORK_GROUP_SIZE_Y": "8", - "WORK_GROUP_SIZE_Z": "1", - "Z_ITERATIONS": "16" + "INNER_UNROLL_FACTOR": 0, + "USE_CONSTANT_MEMORY": 0, + "USE_SOA": 1, + "VECTOR_SIZE": 1, + "WORK_GROUP_SIZE_X": 32, + "WORK_GROUP_SIZE_Y": 2, + "WORK_GROUP_SIZE_Z": 1, + "Z_ITERATIONS": 32 }, "correctness": 1, "invalidity": "correct", @@ -34756,61 +37587,61 @@ { "name": "time", "unit": "", - "value": 8404.416 + "value": 1815.296 }, { "name": "dram__throughput.avg.pct_of_peak_sustained_elapsed", "type": "Double", "unit": "", - "value": 12.598458436143298 + "value": 26.998551978336096 }, { "name": "dram__sectors_read.sum", "type": "Double", "unit": "", - "value": 2109732.0 + "value": 224.0 }, { "name": "dram__sectors_write.sum", "type": "Double", "unit": "", - "value": 2020724.0 + "value": 1837440.0 }, { "name": "lts__t_sectors.avg.pct_of_peak_sustained_elapsed", "type": "Double", "unit": "", - "value": 54.47218814664348 + "value": 3.4912180035355482 }, { "name": "lts__t_sectors_op_read.sum", "type": "Double", "unit": "", - "value": 19974717.0 + "value": 29534.0 }, { "name": "lts__t_sectors_op_write.sum", "type": "Double", "unit": "", - "value": 138418395.0 + "value": 2098995.0 }, { "name": "l1tex__t_requests_pipe_lsu_mem_global_op_ld.avg.pct_of_peak_sustained_active", "type": "Double", "unit": "", - "value": 2.661265779776335 + "value": 5.930339230548586 }, { "name": "l1tex__t_requests_pipe_lsu_mem_global_op_ld.sum", "type": "Double", "unit": "", - "value": 8912896.0 + "value": 4194304.0 }, { "name": "l1tex__data_pipe_lsu_wavefronts_mem_shared.avg.pct_of_peak_sustained_elapsed", "type": "Double", "unit": "", - "value": 0.029150844303390547 + "value": 0.06941621460637369 }, { "name": "l1tex__data_pipe_lsu_wavefronts_mem_shared_op_ld.sum", @@ -34828,25 +37659,25 @@ "name": "l1tex__t_sectors_pipe_lsu_mem_local_op_ld.sum", "type": "Double", "unit": "", - "value": 136314880.0 + "value": 0.0 }, { "name": "l1tex__t_sectors_pipe_lsu_mem_local_op_st.sum", "type": "Double", "unit": "", - "value": 136314880.0 + "value": 0.0 }, { "name": "sm__warps_active.avg.pct_of_peak_sustained_active", "type": "Double", "unit": "", - "value": 89.93756170552965 + "value": 81.69454514104886 }, { "name": "smsp__cycles_active.avg.pct_of_peak_sustained_elapsed", "type": "Double", "unit": "", - "value": 97.7135052625838 + "value": 99.86655823019383 }, { "name": "smsp__sass_thread_inst_executed_op_fp16_pred_on.sum", @@ -34858,7 +37689,7 @@ "name": "smsp__sass_thread_inst_executed_op_fp32_pred_on.sum", "type": "Double", "unit": "", - "value": 4647288832.0 + "value": 4429185024.0 }, { "name": "smsp__sass_thread_inst_executed_op_fp64_pred_on.sum", @@ -34870,43 +37701,43 @@ "name": "smsp__sass_thread_inst_executed_op_integer_pred_on.sum", "type": "Double", "unit": "", - "value": 3700424704.0 + "value": 138412032.0 }, { "name": "smsp__sass_thread_inst_executed_op_control_pred_on.sum", "type": "Double", "unit": "", - "value": 1073741824.0 + "value": 33554432.0 }, { "name": "smsp__sass_thread_inst_executed_op_memory_pred_on.sum", "type": "Double", "unit": "", - "value": 2470445056.0 + "value": 150994944.0 }, { "name": "smsp__sass_thread_inst_executed_op_misc_pred_on.sum", "type": "Double", "unit": "", - "value": 213909504.0 + "value": 283639808.0 }, { "name": "smsp__sass_thread_inst_executed_op_conversion_pred_on.sum", "type": "Double", "unit": "", - "value": 3145728.0 + "value": 1572864.0 }, { "name": "smsp__inst_executed.sum", "type": "Double", "unit": "", - "value": 380665856.0 + "value": 165953536.0 }, { "name": "smsp__pipe_fma_cycles_active.avg.pct_of_peak_sustained_active", "type": "Double", "unit": "", - "value": 9.869885896735733 + "value": 37.63854472271837 }, { "name": "smsp__inst_executed_pipe_fp64.avg.pct_of_peak_sustained_active", @@ -34918,13 +37749,13 @@ "name": "smsp__inst_executed_pipe_xu.avg.pct_of_peak_sustained_active", "type": "Double", "unit": "", - "value": 20.365976696503576 + "value": 94.90291180100705 }, { "name": "smsp__inst_executed_pipe_lsu.avg.pct_of_peak_sustained_active", "type": "Double", "unit": "", - "value": 11.72933081715135 + "value": 3.3711849773062803 }, { "name": "smsp__inst_executed_pipe_tex.avg.pct_of_peak_sustained_active", @@ -34936,7 +37767,7 @@ "name": "smsp__issue_active.avg.pct_of_peak_sustained_active", "type": "Double", "unit": "", - "value": 28.881146921729446 + "value": 58.67266577486683 }, { "name": "smsp__thread_inst_executed_per_inst_executed.ratio", @@ -34955,30 +37786,47 @@ "time" ], "times": { - "compilation": 16595.887, - "data": 64227.387, - "framework": 904265.9639999999, - "kernel_overhead": 365326.182, - "profiling_overhead": 53941.799, - "profiling_runs": 420770.596, + "compilation_time": 14822.072, + "data": 58469.61, + "framework": 169397.222, + "kernel_overhead": 20870.886, + "profiling_overhead": 48516.819, + "profiling_runs": 41539.907, "runtimes": [ - 8404.416 + 1815.296 ], - "search_algorithm": 27.969, - "validation": 17.114 + "search_algorithm": 26.274, + "validation": 15.603 }, - "timestamp": "2026-01-27 09:26:34 UTC" + "timestamp": "2026-03-13 09:41:20 UTC" }, { + "compilation_data": { + "constant_memory_size": 0, + "global_size": { + "x": 8, + "y": 128, + "z": 128 + }, + "local_memory_size": 0, + "local_size": { + "x": 32, + "y": 2, + "z": 1 + }, + "max_work_group_size": 1024, + "private_memory_size": 0, + "registers": 22 + }, "configuration": { - "INNER_UNROLL_FACTOR": "1", - "USE_CONSTANT_MEMORY": "0", - "USE_SOA": "1", - "VECTOR_SIZE": "1", - "WORK_GROUP_SIZE_X": "32", - "WORK_GROUP_SIZE_Y": "2", - "WORK_GROUP_SIZE_Z": "1", - "Z_ITERATIONS": "16" + "INNER_UNROLL_FACTOR": 1, + "USE_CONSTANT_MEMORY": 0, + "USE_SOA": 1, + "VECTOR_SIZE": 1, + "WORK_GROUP_SIZE_X": 32, + "WORK_GROUP_SIZE_Y": 2, + "WORK_GROUP_SIZE_Z": 1, + "Z_ITERATIONS": 2 }, "correctness": 1, "invalidity": "correct", @@ -34986,61 +37834,61 @@ { "name": "time", "unit": "", - "value": 8325.663 + "value": 6406.208 }, { "name": "dram__throughput.avg.pct_of_peak_sustained_elapsed", "type": "Double", "unit": "", - "value": 12.663521731090055 + "value": 7.142914835874903 }, { "name": "dram__sectors_read.sum", "type": "Double", "unit": "", - "value": 2116572.0 + "value": 488.0 }, { "name": "dram__sectors_write.sum", "type": "Double", "unit": "", - "value": 2003056.0 + "value": 1838960.0 }, { "name": "lts__t_sectors.avg.pct_of_peak_sustained_elapsed", "type": "Double", "unit": "", - "value": 53.957994109752505 + "value": 0.9928045163900729 }, { "name": "lts__t_sectors_op_read.sum", "type": "Double", "unit": "", - "value": 16618676.0 + "value": 104956.0 }, { "name": "lts__t_sectors_op_write.sum", "type": "Double", "unit": "", - "value": 138429918.0 + "value": 2099180.0 }, { "name": "l1tex__t_requests_pipe_lsu_mem_global_op_ld.avg.pct_of_peak_sustained_active", "type": "Double", "unit": "", - "value": 2.719859342235066 + "value": 26.284801479237004 }, { "name": "l1tex__t_requests_pipe_lsu_mem_global_op_ld.sum", "type": "Double", "unit": "", - "value": 8912896.0 + "value": 67108864.0 }, { "name": "l1tex__data_pipe_lsu_wavefronts_mem_shared.avg.pct_of_peak_sustained_elapsed", "type": "Double", "unit": "", - "value": 0.029423810529050076 + "value": 0.3079633738367533 }, { "name": "l1tex__data_pipe_lsu_wavefronts_mem_shared_op_ld.sum", @@ -35058,25 +37906,25 @@ "name": "l1tex__t_sectors_pipe_lsu_mem_local_op_ld.sum", "type": "Double", "unit": "", - "value": 136314880.0 + "value": 0.0 }, { "name": "l1tex__t_sectors_pipe_lsu_mem_local_op_st.sum", "type": "Double", "unit": "", - "value": 136314880.0 + "value": 0.0 }, { "name": "sm__warps_active.avg.pct_of_peak_sustained_active", "type": "Double", "unit": "", - "value": 89.29215814871625 + "value": 97.94656402106256 }, { "name": "smsp__cycles_active.avg.pct_of_peak_sustained_elapsed", "type": "Double", "unit": "", - "value": 97.94727309364454 + "value": 99.97406804457385 }, { "name": "smsp__sass_thread_inst_executed_op_fp16_pred_on.sum", @@ -35088,7 +37936,7 @@ "name": "smsp__sass_thread_inst_executed_op_fp32_pred_on.sum", "type": "Double", "unit": "", - "value": 4647288832.0 + "value": 6979321856.0 }, { "name": "smsp__sass_thread_inst_executed_op_fp64_pred_on.sum", @@ -35100,7 +37948,7 @@ "name": "smsp__sass_thread_inst_executed_op_integer_pred_on.sum", "type": "Double", "unit": "", - "value": 3700424704.0 + "value": 10880024576.0 }, { "name": "smsp__sass_thread_inst_executed_op_control_pred_on.sum", @@ -35112,31 +37960,31 @@ "name": "smsp__sass_thread_inst_executed_op_memory_pred_on.sum", "type": "Double", "unit": "", - "value": 2470445056.0 + "value": 2164260864.0 }, { "name": "smsp__sass_thread_inst_executed_op_misc_pred_on.sum", "type": "Double", "unit": "", - "value": 213909504.0 + "value": 2248146944.0 }, { "name": "smsp__sass_thread_inst_executed_op_conversion_pred_on.sum", "type": "Double", "unit": "", - "value": 3145728.0 + "value": 25165824.0 }, { "name": "smsp__inst_executed.sum", "type": "Double", "unit": "", - "value": 380665856.0 + "value": 816054272.0 }, { "name": "smsp__pipe_fma_cycles_active.avg.pct_of_peak_sustained_active", "type": "Double", "unit": "", - "value": 9.938282317708483 + "value": 43.023977601544544 }, { "name": "smsp__inst_executed_pipe_fp64.avg.pct_of_peak_sustained_active", @@ -35148,13 +37996,13 @@ "name": "smsp__inst_executed_pipe_xu.avg.pct_of_peak_sustained_active", "type": "Double", "unit": "", - "value": 20.50762008993749 + "value": 26.286357800657633 }, { "name": "smsp__inst_executed_pipe_lsu.avg.pct_of_peak_sustained_active", "type": "Double", "unit": "", - "value": 11.810907175820933 + "value": 13.399881613225864 }, { "name": "smsp__inst_executed_pipe_tex.avg.pct_of_peak_sustained_active", @@ -35166,7 +38014,7 @@ "name": "smsp__issue_active.avg.pct_of_peak_sustained_active", "type": "Double", "unit": "", - "value": 29.081992788146117 + "value": 79.91189451806795 }, { "name": "smsp__thread_inst_executed_per_inst_executed.ratio", @@ -35185,30 +38033,47 @@ "time" ], "times": { - "compilation": 18570.168, - "data": 62424.495, - "framework": 898649.299, - "kernel_overhead": 364215.53, - "profiling_overhead": 52063.207, - "profiling_runs": 419946.067, + "compilation_time": 15390.895, + "data": 58967.596, + "framework": 2156706.778, + "kernel_overhead": 1002240.735, + "profiling_overhead": 48337.885, + "profiling_runs": 1047160.562, "runtimes": [ - 8325.663 + 6406.208 ], - "search_algorithm": 35.47, - "validation": 16.181 + "search_algorithm": 30.12, + "validation": 15.675 }, - "timestamp": "2026-01-27 09:26:35 UTC" + "timestamp": "2026-03-13 09:41:21 UTC" }, { + "compilation_data": { + "constant_memory_size": 0, + "global_size": { + "x": 8, + "y": 128, + "z": 64 + }, + "local_memory_size": 0, + "local_size": { + "x": 32, + "y": 2, + "z": 1 + }, + "max_work_group_size": 1024, + "private_memory_size": 0, + "registers": 25 + }, "configuration": { - "INNER_UNROLL_FACTOR": "1", - "USE_CONSTANT_MEMORY": "0", - "USE_SOA": "1", - "VECTOR_SIZE": "1", - "WORK_GROUP_SIZE_X": "32", - "WORK_GROUP_SIZE_Y": "4", - "WORK_GROUP_SIZE_Z": "1", - "Z_ITERATIONS": "16" + "INNER_UNROLL_FACTOR": 1, + "USE_CONSTANT_MEMORY": 0, + "USE_SOA": 1, + "VECTOR_SIZE": 1, + "WORK_GROUP_SIZE_X": 32, + "WORK_GROUP_SIZE_Y": 2, + "WORK_GROUP_SIZE_Z": 1, + "Z_ITERATIONS": 4 }, "correctness": 1, "invalidity": "correct", @@ -35216,61 +38081,61 @@ { "name": "time", "unit": "", - "value": 8582.88 + "value": 6276.64 }, { "name": "dram__throughput.avg.pct_of_peak_sustained_elapsed", "type": "Double", "unit": "", - "value": 11.349013430593837 + "value": 7.828358509837652 }, { "name": "dram__sectors_read.sum", "type": "Double", "unit": "", - "value": 2102308.0 + "value": 20000.0 }, { "name": "dram__sectors_write.sum", "type": "Double", "unit": "", - "value": 1998096.0 + "value": 1843976.0 }, { "name": "lts__t_sectors.avg.pct_of_peak_sustained_elapsed", "type": "Double", "unit": "", - "value": 54.309248608013974 + "value": 1.0333337798774174 }, { "name": "lts__t_sectors_op_read.sum", "type": "Double", "unit": "", - "value": 17416864.0 + "value": 122121.0 }, { "name": "lts__t_sectors_op_write.sum", "type": "Double", "unit": "", - "value": 138420535.0 + "value": 2105276.0 }, { "name": "l1tex__t_requests_pipe_lsu_mem_global_op_ld.avg.pct_of_peak_sustained_active", "type": "Double", "unit": "", - "value": 2.745770566741575 + "value": 13.383591981725798 }, { "name": "l1tex__t_requests_pipe_lsu_mem_global_op_ld.sum", "type": "Double", "unit": "", - "value": 8912896.0 + "value": 33554432.0 }, { "name": "l1tex__data_pipe_lsu_wavefronts_mem_shared.avg.pct_of_peak_sustained_elapsed", "type": "Double", "unit": "", - "value": 0.030227547903332853 + "value": 0.15679773296611085 }, { "name": "l1tex__data_pipe_lsu_wavefronts_mem_shared_op_ld.sum", @@ -35288,25 +38153,25 @@ "name": "l1tex__t_sectors_pipe_lsu_mem_local_op_ld.sum", "type": "Double", "unit": "", - "value": 136314880.0 + "value": 0.0 }, { "name": "l1tex__t_sectors_pipe_lsu_mem_local_op_st.sum", "type": "Double", "unit": "", - "value": 136314880.0 + "value": 0.0 }, { "name": "sm__warps_active.avg.pct_of_peak_sustained_active", "type": "Double", "unit": "", - "value": 90.52211884400522 + "value": 98.7814904321765 }, { "name": "smsp__cycles_active.avg.pct_of_peak_sustained_elapsed", "type": "Double", "unit": "", - "value": 100.63692655785765 + "value": 99.96615747172844 }, { "name": "smsp__sass_thread_inst_executed_op_fp16_pred_on.sum", @@ -35318,7 +38183,7 @@ "name": "smsp__sass_thread_inst_executed_op_fp32_pred_on.sum", "type": "Double", "unit": "", - "value": 4647288832.0 + "value": 5637144576.0 }, { "name": "smsp__sass_thread_inst_executed_op_fp64_pred_on.sum", @@ -35330,7 +38195,7 @@ "name": "smsp__sass_thread_inst_executed_op_integer_pred_on.sum", "type": "Double", "unit": "", - "value": 3700424704.0 + "value": 10854858752.0 }, { "name": "smsp__sass_thread_inst_executed_op_control_pred_on.sum", @@ -35342,31 +38207,31 @@ "name": "smsp__sass_thread_inst_executed_op_memory_pred_on.sum", "type": "Double", "unit": "", - "value": 2470445056.0 + "value": 1090519040.0 }, { "name": "smsp__sass_thread_inst_executed_op_misc_pred_on.sum", "type": "Double", "unit": "", - "value": 213909504.0 + "value": 1392508928.0 }, { "name": "smsp__sass_thread_inst_executed_op_conversion_pred_on.sum", "type": "Double", "unit": "", - "value": 3145728.0 + "value": 12582912.0 }, { "name": "smsp__inst_executed.sum", "type": "Double", "unit": "", - "value": 380665856.0 + "value": 837287936.0 }, { "name": "smsp__pipe_fma_cycles_active.avg.pct_of_peak_sustained_active", "type": "Double", "unit": "", - "value": 9.93720146204321 + "value": 36.23264143584361 }, { "name": "smsp__inst_executed_pipe_fp64.avg.pct_of_peak_sustained_active", @@ -35378,13 +38243,13 @@ "name": "smsp__inst_executed_pipe_xu.avg.pct_of_peak_sustained_active", "type": "Double", "unit": "", - "value": 20.504739238843573 + "value": 26.769205802257957 }, { "name": "smsp__inst_executed_pipe_lsu.avg.pct_of_peak_sustained_active", "type": "Double", "unit": "", - "value": 11.80924801377734 + "value": 6.875294068353361 }, { "name": "smsp__inst_executed_pipe_tex.avg.pct_of_peak_sustained_active", @@ -35396,7 +38261,7 @@ "name": "smsp__issue_active.avg.pct_of_peak_sustained_active", "type": "Double", "unit": "", - "value": 29.077927141616577 + "value": 83.49729147773967 }, { "name": "smsp__thread_inst_executed_per_inst_executed.ratio", @@ -35415,30 +38280,47 @@ "time" ], "times": { - "compilation": 16932.544, - "data": 63516.244, - "framework": 899435.8759999999, - "kernel_overhead": 363102.23, - "profiling_overhead": 53315.623, - "profiling_runs": 419501.779, + "compilation_time": 15493.793, + "data": 58680.208, + "framework": 2737684.8310000002, + "kernel_overhead": 1293572.354, + "profiling_overhead": 48603.459, + "profiling_runs": 1336828.81, "runtimes": [ - 8582.88 + 6276.64 ], - "search_algorithm": 25.856, - "validation": 13.608 + "search_algorithm": 25.352, + "validation": 17.037 }, - "timestamp": "2026-01-27 09:26:36 UTC" + "timestamp": "2026-03-13 09:41:23 UTC" }, { + "compilation_data": { + "constant_memory_size": 0, + "global_size": { + "x": 8, + "y": 128, + "z": 32 + }, + "local_memory_size": 0, + "local_size": { + "x": 32, + "y": 2, + "z": 1 + }, + "max_work_group_size": 1024, + "private_memory_size": 32, + "registers": 27 + }, "configuration": { - "INNER_UNROLL_FACTOR": "1", - "USE_CONSTANT_MEMORY": "0", - "USE_SOA": "1", - "VECTOR_SIZE": "1", - "WORK_GROUP_SIZE_X": "32", - "WORK_GROUP_SIZE_Y": "8", - "WORK_GROUP_SIZE_Z": "1", - "Z_ITERATIONS": "16" + "INNER_UNROLL_FACTOR": 1, + "USE_CONSTANT_MEMORY": 0, + "USE_SOA": 1, + "VECTOR_SIZE": 1, + "WORK_GROUP_SIZE_X": 32, + "WORK_GROUP_SIZE_Y": 2, + "WORK_GROUP_SIZE_Z": 1, + "Z_ITERATIONS": 8 }, "correctness": 1, "invalidity": "correct", @@ -35446,61 +38328,61 @@ { "name": "time", "unit": "", - "value": 8268.384 + "value": 8479.136 }, { "name": "dram__throughput.avg.pct_of_peak_sustained_elapsed", "type": "Double", "unit": "", - "value": 12.75159414667144 + "value": 5.953568349656736 }, { "name": "dram__sectors_read.sum", "type": "Double", "unit": "", - "value": 2104456.0 + "value": 21788.0 }, { "name": "dram__sectors_write.sum", "type": "Double", "unit": "", - "value": 1997436.0 + "value": 1914080.0 }, { "name": "lts__t_sectors.avg.pct_of_peak_sustained_elapsed", "type": "Double", "unit": "", - "value": 52.177182552034694 + "value": 48.77726551308113 }, { "name": "lts__t_sectors_op_read.sum", "type": "Double", "unit": "", - "value": 10434381.0 + "value": 682799.0 }, { "name": "lts__t_sectors_op_write.sum", "type": "Double", "unit": "", - "value": 138415821.0 + "value": 138427026.0 }, { "name": "l1tex__t_requests_pipe_lsu_mem_global_op_ld.avg.pct_of_peak_sustained_active", "type": "Double", "unit": "", - "value": 2.722752516003285 + "value": 5.12656241605026 }, { "name": "l1tex__t_requests_pipe_lsu_mem_global_op_ld.sum", "type": "Double", "unit": "", - "value": 8912896.0 + "value": 16777216.0 }, { "name": "l1tex__data_pipe_lsu_wavefronts_mem_shared.avg.pct_of_peak_sustained_elapsed", "type": "Double", "unit": "", - "value": 0.02988937937773416 + "value": 0.0599462503420395 }, { "name": "l1tex__data_pipe_lsu_wavefronts_mem_shared_op_ld.sum", @@ -35530,13 +38412,13 @@ "name": "sm__warps_active.avg.pct_of_peak_sustained_active", "type": "Double", "unit": "", - "value": 85.4565038130171 + "value": 94.23877268587596 }, { "name": "smsp__cycles_active.avg.pct_of_peak_sustained_elapsed", "type": "Double", "unit": "", - "value": 99.44669560724635 + "value": 99.5653249694866 }, { "name": "smsp__sass_thread_inst_executed_op_fp16_pred_on.sum", @@ -35548,7 +38430,7 @@ "name": "smsp__sass_thread_inst_executed_op_fp32_pred_on.sum", "type": "Double", "unit": "", - "value": 4647288832.0 + "value": 4966055936.0 }, { "name": "smsp__sass_thread_inst_executed_op_fp64_pred_on.sum", @@ -35560,7 +38442,7 @@ "name": "smsp__sass_thread_inst_executed_op_integer_pred_on.sum", "type": "Double", "unit": "", - "value": 3700424704.0 + "value": 4250927104.0 }, { "name": "smsp__sass_thread_inst_executed_op_control_pred_on.sum", @@ -35572,31 +38454,31 @@ "name": "smsp__sass_thread_inst_executed_op_memory_pred_on.sum", "type": "Double", "unit": "", - "value": 2470445056.0 + "value": 2722103296.0 }, { "name": "smsp__sass_thread_inst_executed_op_misc_pred_on.sum", "type": "Double", "unit": "", - "value": 213909504.0 + "value": 289406976.0 }, { "name": "smsp__sass_thread_inst_executed_op_conversion_pred_on.sum", "type": "Double", "unit": "", - "value": 3145728.0 + "value": 6291456.0 }, { "name": "smsp__inst_executed.sum", "type": "Double", "unit": "", - "value": 380665856.0 + "value": 420413440.0 }, { "name": "smsp__pipe_fma_cycles_active.avg.pct_of_peak_sustained_active", "type": "Double", "unit": "", - "value": 9.942640291463281 + "value": 12.351833362762246 }, { "name": "smsp__inst_executed_pipe_fp64.avg.pct_of_peak_sustained_active", @@ -35608,13 +38490,13 @@ "name": "smsp__inst_executed_pipe_xu.avg.pct_of_peak_sustained_active", "type": "Double", "unit": "", - "value": 20.51801003938675 + "value": 20.550983443637254 }, { "name": "smsp__inst_executed_pipe_lsu.avg.pct_of_peak_sustained_active", "type": "Double", "unit": "", - "value": 11.816891035867513 + "value": 13.055092509849645 }, { "name": "smsp__inst_executed_pipe_tex.avg.pct_of_peak_sustained_active", @@ -35626,7 +38508,7 @@ "name": "smsp__issue_active.avg.pct_of_peak_sustained_active", "type": "Double", "unit": "", - "value": 29.09674901104483 + "value": 32.18651685810803 }, { "name": "smsp__thread_inst_executed_per_inst_executed.ratio", @@ -35645,30 +38527,47 @@ "time" ], "times": { - "compilation": 15897.435, - "data": 63130.672, - "framework": 897794.649, - "kernel_overhead": 363688.885, - "profiling_overhead": 52831.273, - "profiling_runs": 418143.819, + "compilation_time": 14126.985, + "data": 58974.042, + "framework": 919233.074, + "kernel_overhead": 378167.332, + "profiling_overhead": 48827.317, + "profiling_runs": 433264.383, "runtimes": [ - 8268.384 + 8479.136 ], - "search_algorithm": 29.739, - "validation": 19.874 + "search_algorithm": 24.575, + "validation": 14.584 }, - "timestamp": "2026-01-27 09:26:36 UTC" + "timestamp": "2026-03-13 09:41:23 UTC" }, { + "compilation_data": { + "constant_memory_size": 0, + "global_size": { + "x": 8, + "y": 128, + "z": 16 + }, + "local_memory_size": 0, + "local_size": { + "x": 32, + "y": 2, + "z": 1 + }, + "max_work_group_size": 1024, + "private_memory_size": 64, + "registers": 34 + }, "configuration": { - "INNER_UNROLL_FACTOR": "1", - "USE_CONSTANT_MEMORY": "0", - "USE_SOA": "1", - "VECTOR_SIZE": "1", - "WORK_GROUP_SIZE_X": "16", - "WORK_GROUP_SIZE_Y": "4", - "WORK_GROUP_SIZE_Z": "1", - "Z_ITERATIONS": "32" + "INNER_UNROLL_FACTOR": 1, + "USE_CONSTANT_MEMORY": 0, + "USE_SOA": 1, + "VECTOR_SIZE": 1, + "WORK_GROUP_SIZE_X": 32, + "WORK_GROUP_SIZE_Y": 2, + "WORK_GROUP_SIZE_Z": 1, + "Z_ITERATIONS": 16 }, "correctness": 1, "invalidity": "correct", @@ -35676,61 +38575,61 @@ { "name": "time", "unit": "", - "value": 9877.856 + "value": 8451.072 }, { "name": "dram__throughput.avg.pct_of_peak_sustained_elapsed", "type": "Double", "unit": "", - "value": 11.543212515321635 + "value": 6.189792406575101 }, { "name": "dram__sectors_read.sum", "type": "Double", "unit": "", - "value": 2122496.0 + "value": 12368.0 }, { "name": "dram__sectors_write.sum", "type": "Double", "unit": "", - "value": 2241156.0 + "value": 2004304.0 }, { "name": "lts__t_sectors.avg.pct_of_peak_sustained_elapsed", "type": "Double", "unit": "", - "value": 75.23389293331387 + "value": 53.65165240203037 }, { "name": "lts__t_sectors_op_read.sum", "type": "Double", "unit": "", - "value": 120374550.0 + "value": 14880280.0 }, { "name": "lts__t_sectors_op_write.sum", "type": "Double", "unit": "", - "value": 138424177.0 + "value": 138424098.0 }, { "name": "l1tex__t_requests_pipe_lsu_mem_global_op_ld.avg.pct_of_peak_sustained_active", "type": "Double", "unit": "", - "value": 1.2234255362234265 + "value": 2.57731265414941 }, { "name": "l1tex__t_requests_pipe_lsu_mem_global_op_ld.sum", "type": "Double", "unit": "", - "value": 4718592.0 + "value": 8388608.0 }, { "name": "l1tex__data_pipe_lsu_wavefronts_mem_shared.avg.pct_of_peak_sustained_elapsed", "type": "Double", "unit": "", - "value": 0.01236474898613687 + "value": 0.029960495448286835 }, { "name": "l1tex__data_pipe_lsu_wavefronts_mem_shared_op_ld.sum", @@ -35760,13 +38659,13 @@ "name": "sm__warps_active.avg.pct_of_peak_sustained_active", "type": "Double", "unit": "", - "value": 94.53290476545217 + "value": 89.21683042164096 }, { "name": "smsp__cycles_active.avg.pct_of_peak_sustained_elapsed", "type": "Double", "unit": "", - "value": 95.79233833241638 + "value": 98.74200114751038 }, { "name": "smsp__sass_thread_inst_executed_op_fp16_pred_on.sum", @@ -35778,7 +38677,7 @@ "name": "smsp__sass_thread_inst_executed_op_fp32_pred_on.sum", "type": "Double", "unit": "", - "value": 4479516672.0 + "value": 4630511616.0 }, { "name": "smsp__sass_thread_inst_executed_op_fp64_pred_on.sum", @@ -35790,7 +38689,7 @@ "name": "smsp__sass_thread_inst_executed_op_integer_pred_on.sum", "type": "Double", "unit": "", - "value": 3494379520.0 + "value": 3769630720.0 }, { "name": "smsp__sass_thread_inst_executed_op_control_pred_on.sum", @@ -35802,31 +38701,31 @@ "name": "smsp__sass_thread_inst_executed_op_memory_pred_on.sum", "type": "Double", "unit": "", - "value": 2336227328.0 + "value": 2453667840.0 }, { "name": "smsp__sass_thread_inst_executed_op_misc_pred_on.sum", "type": "Double", "unit": "", - "value": 106954752.0 + "value": 144703488.0 }, { "name": "smsp__sass_thread_inst_executed_op_conversion_pred_on.sum", "type": "Double", "unit": "", - "value": 1572864.0 + "value": 3145728.0 }, { "name": "smsp__inst_executed.sum", "type": "Double", "unit": "", - "value": 360267776.0 + "value": 379617280.0 }, { "name": "smsp__pipe_fma_cycles_active.avg.pct_of_peak_sustained_active", "type": "Double", "unit": "", - "value": 7.64364782154687 + "value": 10.17049792190932 }, { "name": "smsp__inst_executed_pipe_fp64.avg.pct_of_peak_sustained_active", @@ -35838,13 +38737,13 @@ "name": "smsp__inst_executed_pipe_xu.avg.pct_of_peak_sustained_active", "type": "Double", "unit": "", - "value": 17.623543012895944 + "value": 20.71360851681453 }, { "name": "smsp__inst_executed_pipe_lsu.avg.pct_of_peak_sustained_active", "type": "Double", "unit": "", - "value": 9.5926975457157 + "value": 11.848629090550892 }, { "name": "smsp__inst_executed_pipe_tex.avg.pct_of_peak_sustained_active", @@ -35856,7 +38755,7 @@ "name": "smsp__issue_active.avg.pct_of_peak_sustained_active", "type": "Double", "unit": "", - "value": 23.652874541946392 + "value": 29.293215885462736 }, { "name": "smsp__thread_inst_executed_per_inst_executed.ratio", @@ -35875,30 +38774,47 @@ "time" ], "times": { - "compilation": 17589.849, - "data": 62729.402, - "framework": 887414.433, - "kernel_overhead": 353829.081, - "profiling_overhead": 52056.329, - "profiling_runs": 418799.621, + "compilation_time": 14679.663, + "data": 58793.581, + "framework": 884792.574, + "kernel_overhead": 361064.44, + "profiling_overhead": 48506.7, + "profiling_runs": 416427.853, "runtimes": [ - 9877.856 + 8451.072 ], - "search_algorithm": 27.59, - "validation": 17.035 + "search_algorithm": 24.12, + "validation": 13.994 }, - "timestamp": "2026-01-27 09:26:37 UTC" + "timestamp": "2026-03-13 09:41:24 UTC" }, { + "compilation_data": { + "constant_memory_size": 0, + "global_size": { + "x": 8, + "y": 128, + "z": 8 + }, + "local_memory_size": 0, + "local_size": { + "x": 32, + "y": 2, + "z": 1 + }, + "max_work_group_size": 1024, + "private_memory_size": 128, + "registers": 32 + }, "configuration": { - "INNER_UNROLL_FACTOR": "1", - "USE_CONSTANT_MEMORY": "0", - "USE_SOA": "1", - "VECTOR_SIZE": "1", - "WORK_GROUP_SIZE_X": "16", - "WORK_GROUP_SIZE_Y": "8", - "WORK_GROUP_SIZE_Z": "1", - "Z_ITERATIONS": "32" + "INNER_UNROLL_FACTOR": 1, + "USE_CONSTANT_MEMORY": 0, + "USE_SOA": 1, + "VECTOR_SIZE": 1, + "WORK_GROUP_SIZE_X": 32, + "WORK_GROUP_SIZE_Y": 2, + "WORK_GROUP_SIZE_Z": 1, + "Z_ITERATIONS": 32 }, "correctness": 1, "invalidity": "correct", @@ -35906,61 +38822,61 @@ { "name": "time", "unit": "", - "value": 9989.024 + "value": 9429.248 }, { "name": "dram__throughput.avg.pct_of_peak_sustained_elapsed", "type": "Double", "unit": "", - "value": 11.235121503935137 + "value": 5.9656712162143535 }, { "name": "dram__sectors_read.sum", "type": "Double", "unit": "", - "value": 2119092.0 + "value": 19036.0 }, { "name": "dram__sectors_write.sum", "type": "Double", "unit": "", - "value": 2237072.0 + "value": 2197040.0 }, { "name": "lts__t_sectors.avg.pct_of_peak_sustained_elapsed", "type": "Double", "unit": "", - "value": 76.86252004810848 + "value": 73.0614777410446 }, { "name": "lts__t_sectors_op_read.sum", "type": "Double", "unit": "", - "value": 125909496.0 + "value": 100502254.0 }, { "name": "lts__t_sectors_op_write.sum", "type": "Double", "unit": "", - "value": 138422342.0 + "value": 138425449.0 }, { "name": "l1tex__t_requests_pipe_lsu_mem_global_op_ld.avg.pct_of_peak_sustained_active", "type": "Double", "unit": "", - "value": 1.2278683548857303 + "value": 1.141309117724621 }, { "name": "l1tex__t_requests_pipe_lsu_mem_global_op_ld.sum", "type": "Double", "unit": "", - "value": 4718592.0 + "value": 4194304.0 }, { "name": "l1tex__data_pipe_lsu_wavefronts_mem_shared.avg.pct_of_peak_sustained_elapsed", "type": "Double", "unit": "", - "value": 0.012395668528928906 + "value": 0.01299706969386697 }, { "name": "l1tex__data_pipe_lsu_wavefronts_mem_shared_op_ld.sum", @@ -35990,13 +38906,13 @@ "name": "sm__warps_active.avg.pct_of_peak_sustained_active", "type": "Double", "unit": "", - "value": 95.65838606697619 + "value": 89.4785903352049 }, { "name": "smsp__cycles_active.avg.pct_of_peak_sustained_elapsed", "type": "Double", "unit": "", - "value": 96.87663293362884 + "value": 97.4832408878674 }, { "name": "smsp__sass_thread_inst_executed_op_fp16_pred_on.sum", @@ -36008,7 +38924,7 @@ "name": "smsp__sass_thread_inst_executed_op_fp32_pred_on.sum", "type": "Double", "unit": "", - "value": 4479516672.0 + "value": 4462739456.0 }, { "name": "smsp__sass_thread_inst_executed_op_fp64_pred_on.sum", @@ -36020,7 +38936,7 @@ "name": "smsp__sass_thread_inst_executed_op_integer_pred_on.sum", "type": "Double", "unit": "", - "value": 3494379520.0 + "value": 3528982528.0 }, { "name": "smsp__sass_thread_inst_executed_op_control_pred_on.sum", @@ -36032,13 +38948,13 @@ "name": "smsp__sass_thread_inst_executed_op_memory_pred_on.sum", "type": "Double", "unit": "", - "value": 2336227328.0 + "value": 2319450112.0 }, { "name": "smsp__sass_thread_inst_executed_op_misc_pred_on.sum", "type": "Double", "unit": "", - "value": 106954752.0 + "value": 72351744.0 }, { "name": "smsp__sass_thread_inst_executed_op_conversion_pred_on.sum", @@ -36050,13 +38966,13 @@ "name": "smsp__inst_executed.sum", "type": "Double", "unit": "", - "value": 360267776.0 + "value": 359219200.0 }, { "name": "smsp__pipe_fma_cycles_active.avg.pct_of_peak_sustained_active", "type": "Double", "unit": "", - "value": 7.578614411845605 + "value": 7.936586858167701 }, { "name": "smsp__inst_executed_pipe_fp64.avg.pct_of_peak_sustained_active", @@ -36068,13 +38984,13 @@ "name": "smsp__inst_executed_pipe_xu.avg.pct_of_peak_sustained_active", "type": "Double", "unit": "", - "value": 17.469867520160978 + "value": 18.203469978090965 }, { "name": "smsp__inst_executed_pipe_lsu.avg.pct_of_peak_sustained_active", "type": "Double", "unit": "", - "value": 9.509050204150123 + "value": 9.837251171021569 }, { "name": "smsp__inst_executed_pipe_tex.avg.pct_of_peak_sustained_active", @@ -36086,7 +39002,7 @@ "name": "smsp__issue_active.avg.pct_of_peak_sustained_active", "type": "Double", "unit": "", - "value": 23.446638668124685 + "value": 24.360116990699073 }, { "name": "smsp__thread_inst_executed_per_inst_executed.ratio", @@ -36105,30 +39021,47 @@ "time" ], "times": { - "compilation": 17014.591, - "data": 62021.511, - "framework": 888373.475, - "kernel_overhead": 354811.181, - "profiling_overhead": 51763.772, - "profiling_runs": 419777.011, + "compilation_time": 15226.934, + "data": 58512.256, + "framework": 868638.654, + "kernel_overhead": 349665.948, + "profiling_overhead": 48296.749, + "profiling_runs": 412163.701, "runtimes": [ - 9989.024 + 9429.248 ], - "search_algorithm": 25.109, - "validation": 17.256 + "search_algorithm": 25.432, + "validation": 15.772 }, - "timestamp": "2026-01-27 09:26:37 UTC" + "timestamp": "2026-03-13 09:41:24 UTC" }, { + "compilation_data": { + "constant_memory_size": 0, + "global_size": { + "x": 8, + "y": 128, + "z": 64 + }, + "local_memory_size": 0, + "local_size": { + "x": 32, + "y": 2, + "z": 1 + }, + "max_work_group_size": 1024, + "private_memory_size": 0, + "registers": 26 + }, "configuration": { - "INNER_UNROLL_FACTOR": "1", - "USE_CONSTANT_MEMORY": "0", - "USE_SOA": "1", - "VECTOR_SIZE": "1", - "WORK_GROUP_SIZE_X": "32", - "WORK_GROUP_SIZE_Y": "2", - "WORK_GROUP_SIZE_Z": "1", - "Z_ITERATIONS": "32" + "INNER_UNROLL_FACTOR": 2, + "USE_CONSTANT_MEMORY": 0, + "USE_SOA": 1, + "VECTOR_SIZE": 1, + "WORK_GROUP_SIZE_X": 32, + "WORK_GROUP_SIZE_Y": 2, + "WORK_GROUP_SIZE_Z": 1, + "Z_ITERATIONS": 4 }, "correctness": 1, "invalidity": "correct", @@ -36136,61 +39069,61 @@ { "name": "time", "unit": "", - "value": 9547.712 + "value": 4029.632 }, { "name": "dram__throughput.avg.pct_of_peak_sustained_elapsed", "type": "Double", "unit": "", - "value": 10.52512556294973 + "value": 12.070048927456444 }, { "name": "dram__sectors_read.sum", "type": "Double", "unit": "", - "value": 2105336.0 + "value": 3028.0 }, { "name": "dram__sectors_write.sum", "type": "Double", "unit": "", - "value": 2178380.0 + "value": 1837892.0 }, { "name": "lts__t_sectors.avg.pct_of_peak_sustained_elapsed", "type": "Double", "unit": "", - "value": 75.15960045961947 + "value": 1.5711473659317747 }, { "name": "lts__t_sectors_op_read.sum", "type": "Double", "unit": "", - "value": 112149281.0 + "value": 69090.0 }, { "name": "lts__t_sectors_op_write.sum", "type": "Double", "unit": "", - "value": 138418778.0 + "value": 2100102.0 }, { "name": "l1tex__t_requests_pipe_lsu_mem_global_op_ld.avg.pct_of_peak_sustained_active", "type": "Double", "unit": "", - "value": 1.2554828990239648 + "value": 20.8945489218635 }, { "name": "l1tex__t_requests_pipe_lsu_mem_global_op_ld.sum", "type": "Double", "unit": "", - "value": 4718592.0 + "value": 33554432.0 }, { "name": "l1tex__data_pipe_lsu_wavefronts_mem_shared.avg.pct_of_peak_sustained_elapsed", "type": "Double", "unit": "", - "value": 0.012985112517775274 + "value": 0.2447844482989717 }, { "name": "l1tex__data_pipe_lsu_wavefronts_mem_shared_op_ld.sum", @@ -36208,25 +39141,25 @@ "name": "l1tex__t_sectors_pipe_lsu_mem_local_op_ld.sum", "type": "Double", "unit": "", - "value": 136314880.0 + "value": 0.0 }, { "name": "l1tex__t_sectors_pipe_lsu_mem_local_op_st.sum", "type": "Double", "unit": "", - "value": 136314880.0 + "value": 0.0 }, { "name": "sm__warps_active.avg.pct_of_peak_sustained_active", "type": "Double", "unit": "", - "value": 92.79451362107109 + "value": 98.64304255796716 }, { "name": "smsp__cycles_active.avg.pct_of_peak_sustained_elapsed", "type": "Double", "unit": "", - "value": 99.81126669170274 + "value": 99.96224141866213 }, { "name": "smsp__sass_thread_inst_executed_op_fp16_pred_on.sum", @@ -36238,7 +39171,7 @@ "name": "smsp__sass_thread_inst_executed_op_fp32_pred_on.sum", "type": "Double", "unit": "", - "value": 4479516672.0 + "value": 5637144576.0 }, { "name": "smsp__sass_thread_inst_executed_op_fp64_pred_on.sum", @@ -36250,43 +39183,43 @@ "name": "smsp__sass_thread_inst_executed_op_integer_pred_on.sum", "type": "Double", "unit": "", - "value": 3494379520.0 + "value": 5221908480.0 }, { "name": "smsp__sass_thread_inst_executed_op_control_pred_on.sum", "type": "Double", "unit": "", - "value": 1073741824.0 + "value": 536870912.0 }, { "name": "smsp__sass_thread_inst_executed_op_memory_pred_on.sum", "type": "Double", "unit": "", - "value": 2336227328.0 + "value": 1090519040.0 }, { "name": "smsp__sass_thread_inst_executed_op_misc_pred_on.sum", "type": "Double", "unit": "", - "value": 106954752.0 + "value": 2193620992.0 }, { "name": "smsp__sass_thread_inst_executed_op_conversion_pred_on.sum", "type": "Double", "unit": "", - "value": 1572864.0 + "value": 12582912.0 }, { "name": "smsp__inst_executed.sum", "type": "Double", "unit": "", - "value": 360267776.0 + "value": 535298048.0 }, { "name": "smsp__pipe_fma_cycles_active.avg.pct_of_peak_sustained_active", "type": "Double", "unit": "", - "value": 7.705538846611963 + "value": 39.608644392329126 }, { "name": "smsp__inst_executed_pipe_fp64.avg.pct_of_peak_sustained_active", @@ -36298,13 +39231,13 @@ "name": "smsp__inst_executed_pipe_xu.avg.pct_of_peak_sustained_active", "type": "Double", "unit": "", - "value": 17.76253076956122 + "value": 41.79232603244245 }, { "name": "smsp__inst_executed_pipe_lsu.avg.pct_of_peak_sustained_active", "type": "Double", "unit": "", - "value": 9.668350183285337 + "value": 10.73377123684801 }, { "name": "smsp__inst_executed_pipe_tex.avg.pct_of_peak_sustained_active", @@ -36316,7 +39249,7 @@ "name": "smsp__issue_active.avg.pct_of_peak_sustained_active", "type": "Double", "unit": "", - "value": 23.839412513843687 + "value": 83.34044399253526 }, { "name": "smsp__thread_inst_executed_per_inst_executed.ratio", @@ -36335,30 +39268,47 @@ "time" ], "times": { - "compilation": 15674.127, - "data": 64101.311, - "framework": 890274.67, - "kernel_overhead": 354652.954, - "profiling_overhead": 53846.121, - "profiling_runs": 417674.284, + "compilation_time": 15021.459, + "data": 58517.358, + "framework": 1376262.369, + "kernel_overhead": 618734.556, + "profiling_overhead": 48269.183, + "profiling_runs": 650741.272, "runtimes": [ - 9547.712 + 4029.632 ], - "search_algorithm": 24.356, - "validation": 15.265 + "search_algorithm": 24.141, + "validation": 15.923 }, - "timestamp": "2026-01-27 09:26:38 UTC" + "timestamp": "2026-03-13 09:41:25 UTC" }, { + "compilation_data": { + "constant_memory_size": 0, + "global_size": { + "x": 8, + "y": 128, + "z": 32 + }, + "local_memory_size": 0, + "local_size": { + "x": 32, + "y": 2, + "z": 1 + }, + "max_work_group_size": 1024, + "private_memory_size": 0, + "registers": 30 + }, "configuration": { - "INNER_UNROLL_FACTOR": "1", - "USE_CONSTANT_MEMORY": "0", - "USE_SOA": "1", - "VECTOR_SIZE": "1", - "WORK_GROUP_SIZE_X": "32", - "WORK_GROUP_SIZE_Y": "4", - "WORK_GROUP_SIZE_Z": "1", - "Z_ITERATIONS": "32" + "INNER_UNROLL_FACTOR": 2, + "USE_CONSTANT_MEMORY": 0, + "USE_SOA": 1, + "VECTOR_SIZE": 1, + "WORK_GROUP_SIZE_X": 32, + "WORK_GROUP_SIZE_Y": 2, + "WORK_GROUP_SIZE_Z": 1, + "Z_ITERATIONS": 8 }, "correctness": 1, "invalidity": "correct", @@ -36366,61 +39316,61 @@ { "name": "time", "unit": "", - "value": 9812.0 + "value": 4286.048 }, { "name": "dram__throughput.avg.pct_of_peak_sustained_elapsed", "type": "Double", "unit": "", - "value": 10.565723711620038 + "value": 11.316152693430082 }, { "name": "dram__sectors_read.sum", "type": "Double", "unit": "", - "value": 2110736.0 + "value": 592.0 }, { "name": "dram__sectors_write.sum", "type": "Double", "unit": "", - "value": 2174140.0 + "value": 1835920.0 }, { "name": "lts__t_sectors.avg.pct_of_peak_sustained_elapsed", "type": "Double", "unit": "", - "value": 76.98080504913275 + "value": 1.476767000171609 }, { "name": "lts__t_sectors_op_read.sum", "type": "Double", "unit": "", - "value": 123083830.0 + "value": 70665.0 }, { "name": "lts__t_sectors_op_write.sum", "type": "Double", "unit": "", - "value": 138417246.0 + "value": 2098998.0 }, { "name": "l1tex__t_requests_pipe_lsu_mem_global_op_ld.avg.pct_of_peak_sustained_active", "type": "Double", "unit": "", - "value": 1.2281542243552246 + "value": 9.81806999980185 }, { "name": "l1tex__t_requests_pipe_lsu_mem_global_op_ld.sum", "type": "Double", "unit": "", - "value": 4718592.0 + "value": 16777216.0 }, { "name": "l1tex__data_pipe_lsu_wavefronts_mem_shared.avg.pct_of_peak_sustained_elapsed", "type": "Double", "unit": "", - "value": 0.012695353785663993 + "value": 0.11502274455894351 }, { "name": "l1tex__data_pipe_lsu_wavefronts_mem_shared_op_ld.sum", @@ -36438,25 +39388,25 @@ "name": "l1tex__t_sectors_pipe_lsu_mem_local_op_ld.sum", "type": "Double", "unit": "", - "value": 136314880.0 + "value": 0.0 }, { "name": "l1tex__t_sectors_pipe_lsu_mem_local_op_st.sum", "type": "Double", "unit": "", - "value": 136314880.0 + "value": 0.0 }, { "name": "sm__warps_active.avg.pct_of_peak_sustained_active", "type": "Double", "unit": "", - "value": 95.10119402100007 + "value": 98.87989960758149 }, { "name": "smsp__cycles_active.avg.pct_of_peak_sustained_elapsed", "type": "Double", "unit": "", - "value": 98.49302783784859 + "value": 99.96392829326905 }, { "name": "smsp__sass_thread_inst_executed_op_fp16_pred_on.sum", @@ -36468,7 +39418,7 @@ "name": "smsp__sass_thread_inst_executed_op_fp32_pred_on.sum", "type": "Double", "unit": "", - "value": 4479516672.0 + "value": 4966055936.0 }, { "name": "smsp__sass_thread_inst_executed_op_fp64_pred_on.sum", @@ -36480,43 +39430,43 @@ "name": "smsp__sass_thread_inst_executed_op_integer_pred_on.sum", "type": "Double", "unit": "", - "value": 3494379520.0 + "value": 5863636992.0 }, { "name": "smsp__sass_thread_inst_executed_op_control_pred_on.sum", "type": "Double", "unit": "", - "value": 1073741824.0 + "value": 536870912.0 }, { "name": "smsp__sass_thread_inst_executed_op_memory_pred_on.sum", "type": "Double", "unit": "", - "value": 2336227328.0 + "value": 553648128.0 }, { "name": "smsp__sass_thread_inst_executed_op_misc_pred_on.sum", "type": "Double", "unit": "", - "value": 106954752.0 + "value": 1373634560.0 }, { "name": "smsp__sass_thread_inst_executed_op_conversion_pred_on.sum", "type": "Double", "unit": "", - "value": 1572864.0 + "value": 6291456.0 }, { "name": "smsp__inst_executed.sum", "type": "Double", "unit": "", - "value": 360267776.0 + "value": 621477888.0 }, { "name": "smsp__pipe_fma_cycles_active.avg.pct_of_peak_sustained_active", "type": "Double", "unit": "", - "value": 7.631432320114276 + "value": 43.26459964327525 }, { "name": "smsp__inst_executed_pipe_fp64.avg.pct_of_peak_sustained_active", @@ -36528,13 +39478,13 @@ "name": "smsp__inst_executed_pipe_xu.avg.pct_of_peak_sustained_active", "type": "Double", "unit": "", - "value": 17.598595639239505 + "value": 39.275264067524965 }, { "name": "smsp__inst_executed_pipe_lsu.avg.pct_of_peak_sustained_active", "type": "Double", "unit": "", - "value": 9.579118402754998 + "value": 5.120359133803303 }, { "name": "smsp__inst_executed_pipe_tex.avg.pct_of_peak_sustained_active", @@ -36546,7 +39496,7 @@ "name": "smsp__issue_active.avg.pct_of_peak_sustained_active", "type": "Double", "unit": "", - "value": 23.61940710446104 + "value": 90.93004663706377 }, { "name": "smsp__thread_inst_executed_per_inst_executed.ratio", @@ -36565,30 +39515,47 @@ "time" ], "times": { - "compilation": 15543.508, - "data": 61781.74, - "framework": 887394.0250000001, - "kernel_overhead": 355005.602, - "profiling_overhead": 51532.81, - "profiling_runs": 419073.873, + "compilation_time": 14719.263, + "data": 58701.633, + "framework": 1886054.281, + "kernel_overhead": 873075.591, + "profiling_overhead": 48396.837, + "profiling_runs": 905880.22, "runtimes": [ - 9812.0 + 4286.048 ], - "search_algorithm": 34.638, - "validation": 19.786 + "search_algorithm": 25.972, + "validation": 15.853 }, - "timestamp": "2026-01-27 09:26:38 UTC" + "timestamp": "2026-03-13 09:41:26 UTC" }, { + "compilation_data": { + "constant_memory_size": 0, + "global_size": { + "x": 8, + "y": 128, + "z": 16 + }, + "local_memory_size": 0, + "local_size": { + "x": 32, + "y": 2, + "z": 1 + }, + "max_work_group_size": 1024, + "private_memory_size": 0, + "registers": 38 + }, "configuration": { - "INNER_UNROLL_FACTOR": "1", - "USE_CONSTANT_MEMORY": "0", - "USE_SOA": "1", - "VECTOR_SIZE": "1", - "WORK_GROUP_SIZE_X": "32", - "WORK_GROUP_SIZE_Y": "8", - "WORK_GROUP_SIZE_Z": "1", - "Z_ITERATIONS": "32" + "INNER_UNROLL_FACTOR": 2, + "USE_CONSTANT_MEMORY": 0, + "USE_SOA": 1, + "VECTOR_SIZE": 1, + "WORK_GROUP_SIZE_X": 32, + "WORK_GROUP_SIZE_Y": 2, + "WORK_GROUP_SIZE_Z": 1, + "Z_ITERATIONS": 16 }, "correctness": 1, "invalidity": "correct", @@ -36596,61 +39563,61 @@ { "name": "time", "unit": "", - "value": 8795.232 + "value": 6944.48 }, { "name": "dram__throughput.avg.pct_of_peak_sustained_elapsed", "type": "Double", "unit": "", - "value": 12.574023045156856 + "value": 7.030240466358999 }, { "name": "dram__sectors_read.sum", "type": "Double", "unit": "", - "value": 2109160.0 + "value": 13220.0 }, { "name": "dram__sectors_write.sum", "type": "Double", "unit": "", - "value": 2201368.0 + "value": 1839164.0 }, { "name": "lts__t_sectors.avg.pct_of_peak_sustained_elapsed", "type": "Double", "unit": "", - "value": 61.138371135898886 + "value": 0.9341695118239 }, { "name": "lts__t_sectors_op_read.sum", "type": "Double", "unit": "", - "value": 47461292.0 + "value": 124925.0 }, { "name": "lts__t_sectors_op_write.sum", "type": "Double", "unit": "", - "value": 138417946.0 + "value": 2102056.0 }, { "name": "l1tex__t_requests_pipe_lsu_mem_global_op_ld.avg.pct_of_peak_sustained_active", "type": "Double", "unit": "", - "value": 1.3543240192731203 + "value": 3.0410618287599966 }, { "name": "l1tex__t_requests_pipe_lsu_mem_global_op_ld.sum", "type": "Double", "unit": "", - "value": 4718592.0 + "value": 8388608.0 }, { "name": "l1tex__data_pipe_lsu_wavefronts_mem_shared.avg.pct_of_peak_sustained_elapsed", "type": "Double", "unit": "", - "value": 0.013939473869714334 + "value": 0.03563169768917074 }, { "name": "l1tex__data_pipe_lsu_wavefronts_mem_shared_op_ld.sum", @@ -36668,25 +39635,25 @@ "name": "l1tex__t_sectors_pipe_lsu_mem_local_op_ld.sum", "type": "Double", "unit": "", - "value": 136314880.0 + "value": 0.0 }, { "name": "l1tex__t_sectors_pipe_lsu_mem_local_op_st.sum", "type": "Double", "unit": "", - "value": 136314880.0 + "value": 0.0 }, { "name": "sm__warps_active.avg.pct_of_peak_sustained_active", "type": "Double", "unit": "", - "value": 75.82328961472544 + "value": 98.84785016119069 }, { "name": "smsp__cycles_active.avg.pct_of_peak_sustained_elapsed", "type": "Double", "unit": "", - "value": 97.14499130473135 + "value": 99.97963457675819 }, { "name": "smsp__sass_thread_inst_executed_op_fp16_pred_on.sum", @@ -36698,7 +39665,7 @@ "name": "smsp__sass_thread_inst_executed_op_fp32_pred_on.sum", "type": "Double", "unit": "", - "value": 4479516672.0 + "value": 4630511616.0 }, { "name": "smsp__sass_thread_inst_executed_op_fp64_pred_on.sum", @@ -36710,43 +39677,43 @@ "name": "smsp__sass_thread_inst_executed_op_integer_pred_on.sum", "type": "Double", "unit": "", - "value": 3494379520.0 + "value": 8738832384.0 }, { "name": "smsp__sass_thread_inst_executed_op_control_pred_on.sum", "type": "Double", "unit": "", - "value": 1073741824.0 + "value": 536870912.0 }, { "name": "smsp__sass_thread_inst_executed_op_memory_pred_on.sum", "type": "Double", "unit": "", - "value": 2336227328.0 + "value": 285212672.0 }, { "name": "smsp__sass_thread_inst_executed_op_misc_pred_on.sum", "type": "Double", "unit": "", - "value": 106954752.0 + "value": 1157627904.0 }, { "name": "smsp__sass_thread_inst_executed_op_conversion_pred_on.sum", "type": "Double", "unit": "", - "value": 1572864.0 + "value": 3145728.0 }, { "name": "smsp__inst_executed.sum", "type": "Double", "unit": "", - "value": 360267776.0 + "value": 970653696.0 }, { "name": "smsp__pipe_fma_cycles_active.avg.pct_of_peak_sustained_active", "type": "Double", "unit": "", - "value": 8.49846913352596 + "value": 36.4764868523816 }, { "name": "smsp__inst_executed_pipe_fp64.avg.pct_of_peak_sustained_active", @@ -36758,13 +39725,13 @@ "name": "smsp__inst_executed_pipe_xu.avg.pct_of_peak_sustained_active", "type": "Double", "unit": "", - "value": 19.591363453571113 + "value": 24.329527100307267 }, { "name": "smsp__inst_executed_pipe_lsu.avg.pct_of_peak_sustained_active", "type": "Double", "unit": "", - "value": 10.663804887631054 + "value": 1.6334521368614499 }, { "name": "smsp__inst_executed_pipe_tex.avg.pct_of_peak_sustained_active", @@ -36776,7 +39743,7 @@ "name": "smsp__issue_active.avg.pct_of_peak_sustained_active", "type": "Double", "unit": "", - "value": 26.293940486785722 + "value": 87.9750908192645 }, { "name": "smsp__thread_inst_executed_per_inst_executed.ratio", @@ -36795,30 +39762,47 @@ "time" ], "times": { - "compilation": 14933.783, - "data": 61425.48, - "framework": 879562.916, - "kernel_overhead": 354768.44, - "profiling_overhead": 51278.529, - "profiling_runs": 412090.467, + "compilation_time": 15109.783, + "data": 59832.545, + "framework": 3116222.4579999996, + "kernel_overhead": 1480422.893, + "profiling_overhead": 49750.324, + "profiling_runs": 1526216.696, "runtimes": [ - 8795.232 + 6944.48 ], - "search_algorithm": 25.736, - "validation": 18.537 + "search_algorithm": 36.836, + "validation": 17.855 }, - "timestamp": "2026-01-27 09:26:39 UTC" + "timestamp": "2026-03-13 09:41:28 UTC" }, { + "compilation_data": { + "constant_memory_size": 0, + "global_size": { + "x": 8, + "y": 128, + "z": 8 + }, + "local_memory_size": 0, + "local_size": { + "x": 32, + "y": 2, + "z": 1 + }, + "max_work_group_size": 1024, + "private_memory_size": 128, + "registers": 32 + }, "configuration": { - "INNER_UNROLL_FACTOR": "2", - "USE_CONSTANT_MEMORY": "0", - "USE_SOA": "1", - "VECTOR_SIZE": "1", - "WORK_GROUP_SIZE_X": "16", - "WORK_GROUP_SIZE_Y": "4", - "WORK_GROUP_SIZE_Z": "1", - "Z_ITERATIONS": "4" + "INNER_UNROLL_FACTOR": 2, + "USE_CONSTANT_MEMORY": 0, + "USE_SOA": 1, + "VECTOR_SIZE": 1, + "WORK_GROUP_SIZE_X": 32, + "WORK_GROUP_SIZE_Y": 2, + "WORK_GROUP_SIZE_Z": 1, + "Z_ITERATIONS": 32 }, "correctness": 1, "invalidity": "correct", @@ -36826,61 +39810,61 @@ { "name": "time", "unit": "", - "value": 4009.472 + "value": 9658.688 }, { "name": "dram__throughput.avg.pct_of_peak_sustained_elapsed", "type": "Double", "unit": "", - "value": 25.848521277655966 + "value": 5.647078859972619 }, { "name": "dram__sectors_read.sum", "type": "Double", "unit": "", - "value": 2109932.0 + "value": 7756.0 }, { "name": "dram__sectors_write.sum", "type": "Double", "unit": "", - "value": 1873900.0 + "value": 2199220.0 }, { "name": "lts__t_sectors.avg.pct_of_peak_sustained_elapsed", "type": "Double", "unit": "", - "value": 3.0616242797158013 + "value": 81.6655203211476 }, { "name": "lts__t_sectors_op_read.sum", "type": "Double", "unit": "", - "value": 2164583.0 + "value": 133070327.0 }, { "name": "lts__t_sectors_op_write.sum", "type": "Double", "unit": "", - "value": 2106412.0 + "value": 138424154.0 }, { "name": "l1tex__t_requests_pipe_lsu_mem_global_op_ld.avg.pct_of_peak_sustained_active", "type": "Double", "unit": "", - "value": 21.377761939015762 + "value": 1.1182091927382722 }, { "name": "l1tex__t_requests_pipe_lsu_mem_global_op_ld.sum", "type": "Double", "unit": "", - "value": 34078720.0 + "value": 4194304.0 }, { "name": "l1tex__data_pipe_lsu_wavefronts_mem_shared.avg.pct_of_peak_sustained_elapsed", "type": "Double", "unit": "", - "value": 0.24656317955740825 + "value": 0.012919427999895282 }, { "name": "l1tex__data_pipe_lsu_wavefronts_mem_shared_op_ld.sum", @@ -36898,25 +39882,25 @@ "name": "l1tex__t_sectors_pipe_lsu_mem_local_op_ld.sum", "type": "Double", "unit": "", - "value": 0.0 + "value": 136314880.0 }, { "name": "l1tex__t_sectors_pipe_lsu_mem_local_op_st.sum", "type": "Double", "unit": "", - "value": 0.0 + "value": 136314880.0 }, { "name": "sm__warps_active.avg.pct_of_peak_sustained_active", "type": "Double", "unit": "", - "value": 98.31239162124609 + "value": 98.8982202127256 }, { "name": "smsp__cycles_active.avg.pct_of_peak_sustained_elapsed", "type": "Double", "unit": "", - "value": 99.95478866156505 + "value": 98.52452539921906 }, { "name": "smsp__sass_thread_inst_executed_op_fp16_pred_on.sum", @@ -36928,7 +39912,7 @@ "name": "smsp__sass_thread_inst_executed_op_fp32_pred_on.sum", "type": "Double", "unit": "", - "value": 5653921792.0 + "value": 4462739456.0 }, { "name": "smsp__sass_thread_inst_executed_op_fp64_pred_on.sum", @@ -36940,7 +39924,7 @@ "name": "smsp__sass_thread_inst_executed_op_integer_pred_on.sum", "type": "Double", "unit": "", - "value": 5477761024.0 + "value": 1918369792.0 }, { "name": "smsp__sass_thread_inst_executed_op_control_pred_on.sum", @@ -36952,31 +39936,31 @@ "name": "smsp__sass_thread_inst_executed_op_memory_pred_on.sum", "type": "Double", "unit": "", - "value": 1107296256.0 + "value": 1245708288.0 }, { "name": "smsp__sass_thread_inst_executed_op_misc_pred_on.sum", "type": "Double", "unit": "", - "value": 1933574144.0 + "value": 72351744.0 }, { "name": "smsp__sass_thread_inst_executed_op_conversion_pred_on.sum", "type": "Double", "unit": "", - "value": 12582912.0 + "value": 1572864.0 }, { "name": "smsp__inst_executed.sum", "type": "Double", "unit": "", - "value": 536215552.0 + "value": 258555904.0 }, { "name": "smsp__pipe_fma_cycles_active.avg.pct_of_peak_sustained_active", "type": "Double", "unit": "", - "value": 42.530676447895175 + "value": 7.802397388600814 }, { "name": "smsp__inst_executed_pipe_fp64.avg.pct_of_peak_sustained_active", @@ -36988,13 +39972,13 @@ "name": "smsp__inst_executed_pipe_xu.avg.pct_of_peak_sustained_active", "type": "Double", "unit": "", - "value": 42.09914956678655 + "value": 17.903487100681676 }, { "name": "smsp__inst_executed_pipe_lsu.avg.pct_of_peak_sustained_active", "type": "Double", "unit": "", - "value": 10.977024349933604 + "value": 5.199267066958217 }, { "name": "smsp__inst_executed_pipe_tex.avg.pct_of_peak_sustained_active", @@ -37006,7 +39990,7 @@ "name": "smsp__issue_active.avg.pct_of_peak_sustained_active", "type": "Double", "unit": "", - "value": 84.09626922984211 + "value": 17.24486225800338 }, { "name": "smsp__thread_inst_executed_per_inst_executed.ratio", @@ -37025,30 +40009,47 @@ "time" ], "times": { - "compilation": 14967.04, - "data": 61628.594, - "framework": 1314441.528, - "kernel_overhead": 584784.8, - "profiling_overhead": 51360.264, - "profiling_runs": 616667.87, + "compilation_time": 14179.867, + "data": 58929.24, + "framework": 547677.12, + "kernel_overhead": 188348.898, + "profiling_overhead": 48748.344, + "profiling_runs": 251650.638, "runtimes": [ - 4009.472 + 9658.688 ], - "search_algorithm": 38.007, - "validation": 17.312 + "search_algorithm": 25.84, + "validation": 15.836 }, - "timestamp": "2026-01-27 09:26:39 UTC" + "timestamp": "2026-03-13 09:41:28 UTC" }, { + "compilation_data": { + "constant_memory_size": 0, + "global_size": { + "x": 8, + "y": 128, + "z": 32 + }, + "local_memory_size": 0, + "local_size": { + "x": 32, + "y": 2, + "z": 1 + }, + "max_work_group_size": 1024, + "private_memory_size": 0, + "registers": 31 + }, "configuration": { - "INNER_UNROLL_FACTOR": "2", - "USE_CONSTANT_MEMORY": "0", - "USE_SOA": "1", - "VECTOR_SIZE": "1", - "WORK_GROUP_SIZE_X": "16", - "WORK_GROUP_SIZE_Y": "8", - "WORK_GROUP_SIZE_Z": "1", - "Z_ITERATIONS": "4" + "INNER_UNROLL_FACTOR": 4, + "USE_CONSTANT_MEMORY": 0, + "USE_SOA": 1, + "VECTOR_SIZE": 1, + "WORK_GROUP_SIZE_X": 32, + "WORK_GROUP_SIZE_Y": 2, + "WORK_GROUP_SIZE_Z": 1, + "Z_ITERATIONS": 8 }, "correctness": 1, "invalidity": "correct", @@ -37056,61 +40057,61 @@ { "name": "time", "unit": "", - "value": 4073.567 + "value": 2836.576 }, { "name": "dram__throughput.avg.pct_of_peak_sustained_elapsed", "type": "Double", "unit": "", - "value": 25.24782280821251 + "value": 17.137482991979375 }, { "name": "dram__sectors_read.sum", "type": "Double", "unit": "", - "value": 2113076.0 + "value": 476.0 }, { "name": "dram__sectors_write.sum", "type": "Double", "unit": "", - "value": 1872412.0 + "value": 1837408.0 }, { "name": "lts__t_sectors.avg.pct_of_peak_sustained_elapsed", "type": "Double", "unit": "", - "value": 3.0944886500878797 + "value": 2.211686907620278 }, { "name": "lts__t_sectors_op_read.sum", "type": "Double", "unit": "", - "value": 2170105.0 + "value": 48048.0 }, { "name": "lts__t_sectors_op_write.sum", "type": "Double", "unit": "", - "value": 2101362.0 + "value": 2099180.0 }, { "name": "l1tex__t_requests_pipe_lsu_mem_global_op_ld.avg.pct_of_peak_sustained_active", "type": "Double", "unit": "", - "value": 21.37707977457399 + "value": 14.85838960015036 }, { "name": "l1tex__t_requests_pipe_lsu_mem_global_op_ld.sum", "type": "Double", "unit": "", - "value": 34078720.0 + "value": 16777216.0 }, { "name": "l1tex__data_pipe_lsu_wavefronts_mem_shared.avg.pct_of_peak_sustained_elapsed", "type": "Double", "unit": "", - "value": 0.2466065757137678 + "value": 0.17406386137898583 }, { "name": "l1tex__data_pipe_lsu_wavefronts_mem_shared_op_ld.sum", @@ -37140,13 +40141,13 @@ "name": "sm__warps_active.avg.pct_of_peak_sustained_active", "type": "Double", "unit": "", - "value": 98.00113438167384 + "value": 98.80180675968877 }, { "name": "smsp__cycles_active.avg.pct_of_peak_sustained_elapsed", "type": "Double", "unit": "", - "value": 99.96856427178862 + "value": 99.95840720269969 }, { "name": "smsp__sass_thread_inst_executed_op_fp16_pred_on.sum", @@ -37158,7 +40159,7 @@ "name": "smsp__sass_thread_inst_executed_op_fp32_pred_on.sum", "type": "Double", "unit": "", - "value": 5653921792.0 + "value": 4966055936.0 }, { "name": "smsp__sass_thread_inst_executed_op_fp64_pred_on.sum", @@ -37170,43 +40171,43 @@ "name": "smsp__sass_thread_inst_executed_op_integer_pred_on.sum", "type": "Double", "unit": "", - "value": 5477761024.0 + "value": 3042967552.0 }, { "name": "smsp__sass_thread_inst_executed_op_control_pred_on.sum", "type": "Double", "unit": "", - "value": 536870912.0 + "value": 268435456.0 }, { "name": "smsp__sass_thread_inst_executed_op_memory_pred_on.sum", "type": "Double", "unit": "", - "value": 1107296256.0 + "value": 553648128.0 }, { "name": "smsp__sass_thread_inst_executed_op_misc_pred_on.sum", "type": "Double", "unit": "", - "value": 1933574144.0 + "value": 1644167168.0 }, { "name": "smsp__sass_thread_inst_executed_op_conversion_pred_on.sum", "type": "Double", "unit": "", - "value": 12582912.0 + "value": 6291456.0 }, { "name": "smsp__inst_executed.sum", "type": "Double", "unit": "", - "value": 536215552.0 + "value": 399179776.0 }, { "name": "smsp__pipe_fma_cycles_active.avg.pct_of_peak_sustained_active", "type": "Double", "unit": "", - "value": 42.532685955222945 + "value": 45.942716146283 }, { "name": "smsp__inst_executed_pipe_fp64.avg.pct_of_peak_sustained_active", @@ -37218,13 +40219,13 @@ "name": "smsp__inst_executed_pipe_xu.avg.pct_of_peak_sustained_active", "type": "Double", "unit": "", - "value": 42.10075693467462 + "value": 59.438520160569176 }, { "name": "smsp__inst_executed_pipe_lsu.avg.pct_of_peak_sustained_active", "type": "Double", "unit": "", - "value": 10.977443458552855 + "value": 7.749064884214829 }, { "name": "smsp__inst_executed_pipe_tex.avg.pct_of_peak_sustained_active", @@ -37236,7 +40237,7 @@ "name": "smsp__issue_active.avg.pct_of_peak_sustained_active", "type": "Double", "unit": "", - "value": 84.09950830081979 + "value": 88.38944863557491 }, { "name": "smsp__thread_inst_executed_per_inst_executed.ratio", @@ -37255,30 +40256,47 @@ "time" ], "times": { - "compilation": 14523.727, - "data": 62057.871, - "framework": 1313678.7340000002, - "kernel_overhead": 583735.826, - "profiling_overhead": 51582.514, - "profiling_runs": 616302.523, + "compilation_time": 15016.712, + "data": 57524.719, + "framework": 1149413.441, + "kernel_overhead": 509331.702, + "profiling_overhead": 47367.496, + "profiling_runs": 535189.524, "runtimes": [ - 4073.567 + 2836.576 ], - "search_algorithm": 29.419, - "validation": 17.523 + "search_algorithm": 38.789, + "validation": 18.614 }, - "timestamp": "2026-01-27 09:26:40 UTC" + "timestamp": "2026-03-13 09:41:28 UTC" }, { + "compilation_data": { + "constant_memory_size": 0, + "global_size": { + "x": 8, + "y": 128, + "z": 16 + }, + "local_memory_size": 0, + "local_size": { + "x": 32, + "y": 2, + "z": 1 + }, + "max_work_group_size": 1024, + "private_memory_size": 0, + "registers": 38 + }, "configuration": { - "INNER_UNROLL_FACTOR": "2", - "USE_CONSTANT_MEMORY": "0", - "USE_SOA": "1", - "VECTOR_SIZE": "1", - "WORK_GROUP_SIZE_X": "32", - "WORK_GROUP_SIZE_Y": "2", - "WORK_GROUP_SIZE_Z": "1", - "Z_ITERATIONS": "4" + "INNER_UNROLL_FACTOR": 4, + "USE_CONSTANT_MEMORY": 0, + "USE_SOA": 1, + "VECTOR_SIZE": 1, + "WORK_GROUP_SIZE_X": 32, + "WORK_GROUP_SIZE_Y": 2, + "WORK_GROUP_SIZE_Z": 1, + "Z_ITERATIONS": 16 }, "correctness": 1, "invalidity": "correct", @@ -37286,61 +40304,61 @@ { "name": "time", "unit": "", - "value": 3999.392 + "value": 3615.2 }, { "name": "dram__throughput.avg.pct_of_peak_sustained_elapsed", "type": "Double", "unit": "", - "value": 25.989545462231767 + "value": 13.71812342761302 }, { "name": "dram__sectors_read.sum", "type": "Double", "unit": "", - "value": 2097388.0 + "value": 4672.0 }, { "name": "dram__sectors_write.sum", "type": "Double", "unit": "", - "value": 1837188.0 + "value": 1837920.0 }, { "name": "lts__t_sectors.avg.pct_of_peak_sustained_elapsed", "type": "Double", "unit": "", - "value": 3.0886645812046156 + "value": 1.7799336295098027 }, { "name": "lts__t_sectors_op_read.sum", "type": "Double", "unit": "", - "value": 2162935.0 + "value": 62774.0 }, { "name": "lts__t_sectors_op_write.sum", "type": "Double", "unit": "", - "value": 2099195.0 + "value": 2100248.0 }, { "name": "l1tex__t_requests_pipe_lsu_mem_global_op_ld.avg.pct_of_peak_sustained_active", "type": "Double", "unit": "", - "value": 21.379763894424592 + "value": 6.037291361586821 }, { "name": "l1tex__t_requests_pipe_lsu_mem_global_op_ld.sum", "type": "Double", "unit": "", - "value": 34078720.0 + "value": 8388608.0 }, { "name": "l1tex__data_pipe_lsu_wavefronts_mem_shared.avg.pct_of_peak_sustained_elapsed", "type": "Double", "unit": "", - "value": 0.24660886470363963 + "value": 0.07073072064059192 }, { "name": "l1tex__data_pipe_lsu_wavefronts_mem_shared_op_ld.sum", @@ -37370,13 +40388,13 @@ "name": "sm__warps_active.avg.pct_of_peak_sustained_active", "type": "Double", "unit": "", - "value": 98.48363668246468 + "value": 98.70057908323197 }, { "name": "smsp__cycles_active.avg.pct_of_peak_sustained_elapsed", "type": "Double", "unit": "", - "value": 99.95837170917875 + "value": 99.96646309471538 }, { "name": "smsp__sass_thread_inst_executed_op_fp16_pred_on.sum", @@ -37388,7 +40406,7 @@ "name": "smsp__sass_thread_inst_executed_op_fp32_pred_on.sum", "type": "Double", "unit": "", - "value": 5653921792.0 + "value": 4630511616.0 }, { "name": "smsp__sass_thread_inst_executed_op_fp64_pred_on.sum", @@ -37400,43 +40418,43 @@ "name": "smsp__sass_thread_inst_executed_op_integer_pred_on.sum", "type": "Double", "unit": "", - "value": 5477761024.0 + "value": 3301965824.0 }, { "name": "smsp__sass_thread_inst_executed_op_control_pred_on.sum", "type": "Double", "unit": "", - "value": 536870912.0 + "value": 268435456.0 }, { "name": "smsp__sass_thread_inst_executed_op_memory_pred_on.sum", "type": "Double", "unit": "", - "value": 1107296256.0 + "value": 285212672.0 }, { "name": "smsp__sass_thread_inst_executed_op_misc_pred_on.sum", "type": "Double", "unit": "", - "value": 1933574144.0 + "value": 1429209088.0 }, { "name": "smsp__sass_thread_inst_executed_op_conversion_pred_on.sum", "type": "Double", "unit": "", - "value": 12582912.0 + "value": 3145728.0 }, { "name": "smsp__inst_executed.sum", "type": "Double", "unit": "", - "value": 536215552.0 + "value": 513540096.0 }, { "name": "smsp__pipe_fma_cycles_active.avg.pct_of_peak_sustained_active", "type": "Double", "unit": "", - "value": 42.53684523707941 + "value": 42.22282983642167 }, { "name": "smsp__inst_executed_pipe_fp64.avg.pct_of_peak_sustained_active", @@ -37448,13 +40466,13 @@ "name": "smsp__inst_executed_pipe_xu.avg.pct_of_peak_sustained_active", "type": "Double", "unit": "", - "value": 42.10544068472097 + "value": 48.30170418742828 }, { "name": "smsp__inst_executed_pipe_lsu.avg.pct_of_peak_sustained_active", "type": "Double", "unit": "", - "value": 10.978664709785644 + "value": 3.2429122684430607 }, { "name": "smsp__inst_executed_pipe_tex.avg.pct_of_peak_sustained_active", @@ -37466,7 +40484,7 @@ "name": "smsp__issue_active.avg.pct_of_peak_sustained_active", "type": "Double", "unit": "", - "value": 84.10883559137594 + "value": 92.40603906687596 }, { "name": "smsp__thread_inst_executed_per_inst_executed.ratio", @@ -37485,30 +40503,47 @@ "time" ], "times": { - "compilation": 14217.079, - "data": 61176.929, - "framework": 1322227.118, - "kernel_overhead": 589299.283, - "profiling_overhead": 50834.864, - "profiling_runs": 620916.042, + "compilation_time": 14414.732, + "data": 57929.212, + "framework": 1831595.019, + "kernel_overhead": 848653.79, + "profiling_overhead": 47127.48, + "profiling_runs": 877884.537, "runtimes": [ - 3999.392 + 3615.2 ], - "search_algorithm": 21.816, - "validation": 15.118 + "search_algorithm": 26.839, + "validation": 16.371 }, - "timestamp": "2026-01-27 09:26:41 UTC" + "timestamp": "2026-03-13 09:41:29 UTC" }, { + "compilation_data": { + "constant_memory_size": 0, + "global_size": { + "x": 8, + "y": 128, + "z": 8 + }, + "local_memory_size": 0, + "local_size": { + "x": 32, + "y": 2, + "z": 1 + }, + "max_work_group_size": 1024, + "private_memory_size": 0, + "registers": 56 + }, "configuration": { - "INNER_UNROLL_FACTOR": "2", - "USE_CONSTANT_MEMORY": "0", - "USE_SOA": "1", - "VECTOR_SIZE": "1", - "WORK_GROUP_SIZE_X": "32", - "WORK_GROUP_SIZE_Y": "4", - "WORK_GROUP_SIZE_Z": "1", - "Z_ITERATIONS": "4" + "INNER_UNROLL_FACTOR": 4, + "USE_CONSTANT_MEMORY": 0, + "USE_SOA": 1, + "VECTOR_SIZE": 1, + "WORK_GROUP_SIZE_X": 32, + "WORK_GROUP_SIZE_Y": 2, + "WORK_GROUP_SIZE_Z": 1, + "Z_ITERATIONS": 32 }, "correctness": 1, "invalidity": "correct", @@ -37516,61 +40551,61 @@ { "name": "time", "unit": "", - "value": 3994.912 + "value": 5674.592 }, { "name": "dram__throughput.avg.pct_of_peak_sustained_elapsed", "type": "Double", "unit": "", - "value": 25.383935796026886 + "value": 8.54729196919561 }, { "name": "dram__sectors_read.sum", "type": "Double", "unit": "", - "value": 2098716.0 + "value": 3984.0 }, { "name": "dram__sectors_write.sum", "type": "Double", "unit": "", - "value": 1836452.0 + "value": 1834332.0 }, { "name": "lts__t_sectors.avg.pct_of_peak_sustained_elapsed", "type": "Double", "unit": "", - "value": 3.1018743558087447 + "value": 1.127958455212252 }, { "name": "lts__t_sectors_op_read.sum", "type": "Double", "unit": "", - "value": 2164070.0 + "value": 94998.0 }, { "name": "lts__t_sectors_op_write.sum", "type": "Double", "unit": "", - "value": 2100424.0 + "value": 2099701.0 }, { "name": "l1tex__t_requests_pipe_lsu_mem_global_op_ld.avg.pct_of_peak_sustained_active", "type": "Double", "unit": "", - "value": 21.37951951426517 + "value": 1.8628873657125555 }, { "name": "l1tex__t_requests_pipe_lsu_mem_global_op_ld.sum", "type": "Double", "unit": "", - "value": 34078720.0 + "value": 4194304.0 }, { "name": "l1tex__data_pipe_lsu_wavefronts_mem_shared.avg.pct_of_peak_sustained_elapsed", "type": "Double", "unit": "", - "value": 0.2465971851136014 + "value": 0.021826038727080128 }, { "name": "l1tex__data_pipe_lsu_wavefronts_mem_shared_op_ld.sum", @@ -37600,13 +40635,13 @@ "name": "sm__warps_active.avg.pct_of_peak_sustained_active", "type": "Double", "unit": "", - "value": 98.25322994951983 + "value": 73.90327559477647 }, { "name": "smsp__cycles_active.avg.pct_of_peak_sustained_elapsed", "type": "Double", "unit": "", - "value": 99.95469666619691 + "value": 99.97480138713286 }, { "name": "smsp__sass_thread_inst_executed_op_fp16_pred_on.sum", @@ -37618,7 +40653,7 @@ "name": "smsp__sass_thread_inst_executed_op_fp32_pred_on.sum", "type": "Double", "unit": "", - "value": 5653921792.0 + "value": 4462739456.0 }, { "name": "smsp__sass_thread_inst_executed_op_fp64_pred_on.sum", @@ -37630,43 +40665,43 @@ "name": "smsp__sass_thread_inst_executed_op_integer_pred_on.sum", "type": "Double", "unit": "", - "value": 5477761024.0 + "value": 4806148096.0 }, { "name": "smsp__sass_thread_inst_executed_op_control_pred_on.sum", "type": "Double", "unit": "", - "value": 536870912.0 + "value": 268435456.0 }, { "name": "smsp__sass_thread_inst_executed_op_memory_pred_on.sum", "type": "Double", "unit": "", - "value": 1107296256.0 + "value": 150994944.0 }, { "name": "smsp__sass_thread_inst_executed_op_misc_pred_on.sum", "type": "Double", "unit": "", - "value": 1933574144.0 + "value": 1222115328.0 }, { "name": "smsp__sass_thread_inst_executed_op_conversion_pred_on.sum", "type": "Double", "unit": "", - "value": 12582912.0 + "value": 1572864.0 }, { "name": "smsp__inst_executed.sum", "type": "Double", "unit": "", - "value": 536215552.0 + "value": 868564992.0 }, { "name": "smsp__pipe_fma_cycles_active.avg.pct_of_peak_sustained_active", "type": "Double", "unit": "", - "value": 42.536565981077764 + "value": 42.34411472303697 }, { "name": "smsp__inst_executed_pipe_fp64.avg.pct_of_peak_sustained_active", @@ -37678,13 +40713,13 @@ "name": "smsp__inst_executed_pipe_xu.avg.pct_of_peak_sustained_active", "type": "Double", "unit": "", - "value": 42.104994558954125 + "value": 29.807329242208514 }, { "name": "smsp__inst_executed_pipe_lsu.avg.pct_of_peak_sustained_active", "type": "Double", "unit": "", - "value": 10.978548385977296 + "value": 1.0588296886575534 }, { "name": "smsp__inst_executed_pipe_tex.avg.pct_of_peak_sustained_active", @@ -37696,7 +40731,7 @@ "name": "smsp__issue_active.avg.pct_of_peak_sustained_active", "type": "Double", "unit": "", - "value": 84.10797328207471 + "value": 96.44684757994924 }, { "name": "smsp__thread_inst_executed_per_inst_executed.ratio", @@ -37715,30 +40750,47 @@ "time" ], "times": { - "compilation": 14548.558, - "data": 61868.308, - "framework": 1312452.302, - "kernel_overhead": 583586.022, - "profiling_overhead": 51562.535, - "profiling_runs": 615435.437, + "compilation_time": 15029.206, + "data": 58526.07, + "framework": 3300689.613, + "kernel_overhead": 1577171.897, + "profiling_overhead": 48422.668, + "profiling_runs": 1616568.978, "runtimes": [ - 3994.912 + 5674.592 ], - "search_algorithm": 25.687, - "validation": 15.112 + "search_algorithm": 26.4, + "validation": 16.421 }, - "timestamp": "2026-01-27 09:26:42 UTC" + "timestamp": "2026-03-13 09:41:31 UTC" }, { + "compilation_data": { + "constant_memory_size": 0, + "global_size": { + "x": 8, + "y": 128, + "z": 16 + }, + "local_memory_size": 0, + "local_size": { + "x": 32, + "y": 2, + "z": 1 + }, + "max_work_group_size": 1024, + "private_memory_size": 0, + "registers": 38 + }, "configuration": { - "INNER_UNROLL_FACTOR": "2", - "USE_CONSTANT_MEMORY": "0", - "USE_SOA": "1", - "VECTOR_SIZE": "1", - "WORK_GROUP_SIZE_X": "32", - "WORK_GROUP_SIZE_Y": "8", - "WORK_GROUP_SIZE_Z": "1", - "Z_ITERATIONS": "4" + "INNER_UNROLL_FACTOR": 8, + "USE_CONSTANT_MEMORY": 0, + "USE_SOA": 1, + "VECTOR_SIZE": 1, + "WORK_GROUP_SIZE_X": 32, + "WORK_GROUP_SIZE_Y": 2, + "WORK_GROUP_SIZE_Z": 1, + "Z_ITERATIONS": 16 }, "correctness": 1, "invalidity": "correct", @@ -37746,61 +40798,61 @@ { "name": "time", "unit": "", - "value": 3997.184 + "value": 3466.592 }, { "name": "dram__throughput.avg.pct_of_peak_sustained_elapsed", "type": "Double", "unit": "", - "value": 25.730564526897336 + "value": 14.106891281881289 }, { "name": "dram__sectors_read.sum", "type": "Double", "unit": "", - "value": 2105748.0 + "value": 10952.0 }, { "name": "dram__sectors_write.sum", "type": "Double", "unit": "", - "value": 1840736.0 + "value": 1841100.0 }, { "name": "lts__t_sectors.avg.pct_of_peak_sustained_elapsed", "type": "Double", "unit": "", - "value": 3.0748507787101924 + "value": 1.8598817527805969 }, { "name": "lts__t_sectors_op_read.sum", "type": "Double", "unit": "", - "value": 2173142.0 + "value": 68734.0 }, { "name": "lts__t_sectors_op_write.sum", "type": "Double", "unit": "", - "value": 2108563.0 + "value": 2105214.0 }, { "name": "l1tex__t_requests_pipe_lsu_mem_global_op_ld.avg.pct_of_peak_sustained_active", "type": "Double", "unit": "", - "value": 21.271784923590474 + "value": 6.226870093893029 }, { "name": "l1tex__t_requests_pipe_lsu_mem_global_op_ld.sum", "type": "Double", "unit": "", - "value": 34078720.0 + "value": 8388608.0 }, { "name": "l1tex__data_pipe_lsu_wavefronts_mem_shared.avg.pct_of_peak_sustained_elapsed", "type": "Double", "unit": "", - "value": 0.24536633425332396 + "value": 0.07293362013751382 }, { "name": "l1tex__data_pipe_lsu_wavefronts_mem_shared_op_ld.sum", @@ -37830,13 +40882,13 @@ "name": "sm__warps_active.avg.pct_of_peak_sustained_active", "type": "Double", "unit": "", - "value": 97.81580167448942 + "value": 98.72792562083042 }, { "name": "smsp__cycles_active.avg.pct_of_peak_sustained_elapsed", "type": "Double", "unit": "", - "value": 99.96151455854228 + "value": 99.94187638445823 }, { "name": "smsp__sass_thread_inst_executed_op_fp16_pred_on.sum", @@ -37848,7 +40900,7 @@ "name": "smsp__sass_thread_inst_executed_op_fp32_pred_on.sum", "type": "Double", "unit": "", - "value": 5653921792.0 + "value": 4630511616.0 }, { "name": "smsp__sass_thread_inst_executed_op_fp64_pred_on.sum", @@ -37860,43 +40912,43 @@ "name": "smsp__sass_thread_inst_executed_op_integer_pred_on.sum", "type": "Double", "unit": "", - "value": 5477761024.0 + "value": 2563768320.0 }, { "name": "smsp__sass_thread_inst_executed_op_control_pred_on.sum", "type": "Double", "unit": "", - "value": 536870912.0 + "value": 134217728.0 }, { "name": "smsp__sass_thread_inst_executed_op_memory_pred_on.sum", "type": "Double", "unit": "", - "value": 1107296256.0 + "value": 285212672.0 }, { "name": "smsp__sass_thread_inst_executed_op_misc_pred_on.sum", "type": "Double", "unit": "", - "value": 1933574144.0 + "value": 1362100224.0 }, { "name": "smsp__sass_thread_inst_executed_op_conversion_pred_on.sum", "type": "Double", "unit": "", - "value": 12582912.0 + "value": 3145728.0 }, { "name": "smsp__inst_executed.sum", "type": "Double", "unit": "", - "value": 536215552.0 + "value": 484179968.0 }, { "name": "smsp__pipe_fma_cycles_active.avg.pct_of_peak_sustained_active", "type": "Double", "unit": "", - "value": 42.32203061158044 + "value": 42.7697103905528 }, { "name": "smsp__inst_executed_pipe_fp64.avg.pct_of_peak_sustained_active", @@ -37908,13 +40960,13 @@ "name": "smsp__inst_executed_pipe_xu.avg.pct_of_peak_sustained_active", "type": "Double", "unit": "", - "value": 41.89197669139901 + "value": 49.8183075487585 }, { "name": "smsp__inst_executed_pipe_lsu.avg.pct_of_peak_sustained_active", "type": "Double", "unit": "", - "value": 10.92300564121439 + "value": 3.3447350038839323 }, { "name": "smsp__inst_executed_pipe_tex.avg.pct_of_peak_sustained_active", @@ -37926,7 +40978,7 @@ "name": "smsp__issue_active.avg.pct_of_peak_sustained_active", "type": "Double", "unit": "", - "value": 83.68245179344001 + "value": 89.85857932814986 }, { "name": "smsp__thread_inst_executed_per_inst_executed.ratio", @@ -37945,30 +40997,47 @@ "time" ], "times": { - "compilation": 17351.073, - "data": 61510.676, - "framework": 1320798.738, - "kernel_overhead": 588151.352, - "profiling_overhead": 51213.737, - "profiling_runs": 619922.973, + "compilation_time": 14099.974, + "data": 57388.796, + "framework": 1705241.5129999998, + "kernel_overhead": 785917.847, + "profiling_overhead": 47302.62, + "profiling_runs": 814632.25, "runtimes": [ - 3997.184 + 3466.592 ], - "search_algorithm": 30.829, - "validation": 20.533 + "search_algorithm": 25.746, + "validation": 15.669 }, - "timestamp": "2026-01-27 09:26:42 UTC" + "timestamp": "2026-03-13 09:41:32 UTC" }, { + "compilation_data": { + "constant_memory_size": 0, + "global_size": { + "x": 8, + "y": 128, + "z": 8 + }, + "local_memory_size": 0, + "local_size": { + "x": 32, + "y": 2, + "z": 1 + }, + "max_work_group_size": 1024, + "private_memory_size": 0, + "registers": 48 + }, "configuration": { - "INNER_UNROLL_FACTOR": "2", - "USE_CONSTANT_MEMORY": "0", - "USE_SOA": "1", - "VECTOR_SIZE": "1", - "WORK_GROUP_SIZE_X": "16", - "WORK_GROUP_SIZE_Y": "4", - "WORK_GROUP_SIZE_Z": "1", - "Z_ITERATIONS": "8" + "INNER_UNROLL_FACTOR": 8, + "USE_CONSTANT_MEMORY": 0, + "USE_SOA": 1, + "VECTOR_SIZE": 1, + "WORK_GROUP_SIZE_X": 32, + "WORK_GROUP_SIZE_Y": 2, + "WORK_GROUP_SIZE_Z": 1, + "Z_ITERATIONS": 32 }, "correctness": 1, "invalidity": "correct", @@ -37976,61 +41045,61 @@ { "name": "time", "unit": "", - "value": 4761.248 + "value": 6361.76 }, { "name": "dram__throughput.avg.pct_of_peak_sustained_elapsed", "type": "Double", "unit": "", - "value": 21.86783358029918 + "value": 7.638054406069288 }, { "name": "dram__sectors_read.sum", "type": "Double", "unit": "", - "value": 2098036.0 + "value": 6340.0 }, { "name": "dram__sectors_write.sum", "type": "Double", "unit": "", - "value": 1868824.0 + "value": 1836472.0 }, { "name": "lts__t_sectors.avg.pct_of_peak_sustained_elapsed", "type": "Double", "unit": "", - "value": 2.5819755090413894 + "value": 1.012159186207411 }, { "name": "lts__t_sectors_op_read.sum", "type": "Double", "unit": "", - "value": 2165986.0 + "value": 108586.0 }, { "name": "lts__t_sectors_op_write.sum", "type": "Double", "unit": "", - "value": 2103038.0 + "value": 2100872.0 }, { "name": "l1tex__t_requests_pipe_lsu_mem_global_op_ld.avg.pct_of_peak_sustained_active", "type": "Double", "unit": "", - "value": 9.004766282090744 + "value": 1.6606308719699006 }, { "name": "l1tex__t_requests_pipe_lsu_mem_global_op_ld.sum", "type": "Double", "unit": "", - "value": 17301504.0 + "value": 4194304.0 }, { "name": "l1tex__data_pipe_lsu_wavefronts_mem_shared.avg.pct_of_peak_sustained_elapsed", "type": "Double", "unit": "", - "value": 0.10230745012293255 + "value": 0.019456758683876946 }, { "name": "l1tex__data_pipe_lsu_wavefronts_mem_shared_op_ld.sum", @@ -38060,13 +41129,13 @@ "name": "sm__warps_active.avg.pct_of_peak_sustained_active", "type": "Double", "unit": "", - "value": 99.02932004969921 + "value": 82.03142035818168 }, { "name": "smsp__cycles_active.avg.pct_of_peak_sustained_elapsed", "type": "Double", "unit": "", - "value": 99.97736110471065 + "value": 99.97402509652693 }, { "name": "smsp__sass_thread_inst_executed_op_fp16_pred_on.sum", @@ -38078,7 +41147,7 @@ "name": "smsp__sass_thread_inst_executed_op_fp32_pred_on.sum", "type": "Double", "unit": "", - "value": 4982833152.0 + "value": 4462739456.0 }, { "name": "smsp__sass_thread_inst_executed_op_fp64_pred_on.sum", @@ -38090,43 +41159,43 @@ "name": "smsp__sass_thread_inst_executed_op_integer_pred_on.sum", "type": "Double", "unit": "", - "value": 5460983808.0 + "value": 7219970048.0 }, { "name": "smsp__sass_thread_inst_executed_op_control_pred_on.sum", "type": "Double", "unit": "", - "value": 536870912.0 + "value": 134217728.0 }, { "name": "smsp__sass_thread_inst_executed_op_memory_pred_on.sum", "type": "Double", "unit": "", - "value": 570425344.0 + "value": 150994944.0 }, { "name": "smsp__sass_thread_inst_executed_op_misc_pred_on.sum", "type": "Double", "unit": "", - "value": 1774190592.0 + "value": 551026688.0 }, { "name": "smsp__sass_thread_inst_executed_op_conversion_pred_on.sum", "type": "Double", "unit": "", - "value": 6291456.0 + "value": 1572864.0 }, { "name": "smsp__inst_executed.sum", "type": "Double", "unit": "", - "value": 622460928.0 + "value": 952385536.0 }, { "name": "smsp__pipe_fma_cycles_active.avg.pct_of_peak_sustained_active", "type": "Double", "unit": "", - "value": 31.9870576086791 + "value": 43.244345527063615 }, { "name": "smsp__inst_executed_pipe_fp64.avg.pct_of_peak_sustained_active", @@ -38138,13 +41207,13 @@ "name": "smsp__inst_executed_pipe_xu.avg.pct_of_peak_sustained_active", "type": "Double", "unit": "", - "value": 34.92885048118052 + "value": 26.571863205538627 }, { "name": "smsp__inst_executed_pipe_lsu.avg.pct_of_peak_sustained_active", "type": "Double", "unit": "", - "value": 4.690153262853829 + "value": 0.9438979727553395 }, { "name": "smsp__inst_executed_pipe_tex.avg.pct_of_peak_sustained_active", @@ -38156,7 +41225,7 @@ "name": "smsp__issue_active.avg.pct_of_peak_sustained_active", "type": "Double", "unit": "", - "value": 80.99514343873363 + "value": 94.27497605806782 }, { "name": "smsp__thread_inst_executed_per_inst_executed.ratio", @@ -38175,30 +41244,47 @@ "time" ], "times": { - "compilation": 18300.148, - "data": 61104.089, - "framework": 1936693.9, - "kernel_overhead": 894830.157, - "profiling_overhead": 50558.534, - "profiling_runs": 930201.12, + "compilation_time": 14438.902, + "data": 57913.447, + "framework": 3664526.314, + "kernel_overhead": 1757997.859, + "profiling_overhead": 47708.651, + "profiling_runs": 1800906.357, "runtimes": [ - 4761.248 + 6361.76 ], - "search_algorithm": 26.659, - "validation": 16.685 + "search_algorithm": 25.55, + "validation": 14.45 }, - "timestamp": "2026-01-27 09:26:43 UTC" + "timestamp": "2026-03-13 09:41:34 UTC" }, { + "compilation_data": { + "constant_memory_size": 0, + "global_size": { + "x": 8, + "y": 128, + "z": 8 + }, + "local_memory_size": 0, + "local_size": { + "x": 32, + "y": 2, + "z": 1 + }, + "max_work_group_size": 1024, + "private_memory_size": 0, + "registers": 48 + }, "configuration": { - "INNER_UNROLL_FACTOR": "2", - "USE_CONSTANT_MEMORY": "0", - "USE_SOA": "1", - "VECTOR_SIZE": "1", - "WORK_GROUP_SIZE_X": "16", - "WORK_GROUP_SIZE_Y": "8", - "WORK_GROUP_SIZE_Z": "1", - "Z_ITERATIONS": "8" + "INNER_UNROLL_FACTOR": 16, + "USE_CONSTANT_MEMORY": 0, + "USE_SOA": 1, + "VECTOR_SIZE": 1, + "WORK_GROUP_SIZE_X": 32, + "WORK_GROUP_SIZE_Y": 2, + "WORK_GROUP_SIZE_Z": 1, + "Z_ITERATIONS": 32 }, "correctness": 1, "invalidity": "correct", @@ -38206,61 +41292,61 @@ { "name": "time", "unit": "", - "value": 4848.928 + "value": 6265.056 }, { "name": "dram__throughput.avg.pct_of_peak_sustained_elapsed", "type": "Double", "unit": "", - "value": 21.67117196435047 + "value": 7.498591184524243 }, { "name": "dram__sectors_read.sum", "type": "Double", "unit": "", - "value": 2107760.0 + "value": 5608.0 }, { "name": "dram__sectors_write.sum", "type": "Double", "unit": "", - "value": 1869916.0 + "value": 1838148.0 }, { "name": "lts__t_sectors.avg.pct_of_peak_sustained_elapsed", "type": "Double", "unit": "", - "value": 2.5733276875189075 + "value": 1.0484108100397298 }, { "name": "lts__t_sectors_op_read.sum", "type": "Double", "unit": "", - "value": 2173412.0 + "value": 104776.0 }, { "name": "lts__t_sectors_op_write.sum", "type": "Double", "unit": "", - "value": 2101290.0 + "value": 2100980.0 }, { "name": "l1tex__t_requests_pipe_lsu_mem_global_op_ld.avg.pct_of_peak_sustained_active", "type": "Double", "unit": "", - "value": 9.004817507291069 + "value": 1.7422229010057622 }, { "name": "l1tex__t_requests_pipe_lsu_mem_global_op_ld.sum", "type": "Double", "unit": "", - "value": 17301504.0 + "value": 4194304.0 }, { "name": "l1tex__data_pipe_lsu_wavefronts_mem_shared.avg.pct_of_peak_sustained_elapsed", "type": "Double", "unit": "", - "value": 0.10230176898890211 + "value": 0.020412346678990268 }, { "name": "l1tex__data_pipe_lsu_wavefronts_mem_shared_op_ld.sum", @@ -38290,13 +41376,13 @@ "name": "sm__warps_active.avg.pct_of_peak_sustained_active", "type": "Double", "unit": "", - "value": 98.87764828547452 + "value": 82.02418695483709 }, { "name": "smsp__cycles_active.avg.pct_of_peak_sustained_elapsed", "type": "Double", "unit": "", - "value": 99.97216982172743 + "value": 99.97476499139685 }, { "name": "smsp__sass_thread_inst_executed_op_fp16_pred_on.sum", @@ -38308,7 +41394,7 @@ "name": "smsp__sass_thread_inst_executed_op_fp32_pred_on.sum", "type": "Double", "unit": "", - "value": 4982833152.0 + "value": 4462739456.0 }, { "name": "smsp__sass_thread_inst_executed_op_fp64_pred_on.sum", @@ -38320,43 +41406,43 @@ "name": "smsp__sass_thread_inst_executed_op_integer_pred_on.sum", "type": "Double", "unit": "", - "value": 5460983808.0 + "value": 5944901632.0 }, { "name": "smsp__sass_thread_inst_executed_op_control_pred_on.sum", "type": "Double", "unit": "", - "value": 536870912.0 + "value": 101187584.0 }, { "name": "smsp__sass_thread_inst_executed_op_memory_pred_on.sum", "type": "Double", "unit": "", - "value": 570425344.0 + "value": 150994944.0 }, { "name": "smsp__sass_thread_inst_executed_op_misc_pred_on.sum", "type": "Double", "unit": "", - "value": 1774190592.0 + "value": 1054343168.0 }, { "name": "smsp__sass_thread_inst_executed_op_conversion_pred_on.sum", "type": "Double", "unit": "", - "value": 6291456.0 + "value": 1572864.0 }, { "name": "smsp__inst_executed.sum", "type": "Double", "unit": "", - "value": 622460928.0 + "value": 913571840.0 }, { "name": "smsp__pipe_fma_cycles_active.avg.pct_of_peak_sustained_active", "type": "Double", "unit": "", - "value": 31.98690323976678 + "value": 44.06080943256221 }, { "name": "smsp__inst_executed_pipe_fp64.avg.pct_of_peak_sustained_active", @@ -38368,13 +41454,13 @@ "name": "smsp__inst_executed_pipe_xu.avg.pct_of_peak_sustained_active", "type": "Double", "unit": "", - "value": 34.92872454118673 + "value": 27.876692018010395 }, { "name": "smsp__inst_executed_pipe_lsu.avg.pct_of_peak_sustained_active", "type": "Double", "unit": "", - "value": 4.690136351965992 + "value": 0.9902487032764924 }, { "name": "smsp__inst_executed_pipe_tex.avg.pct_of_peak_sustained_active", @@ -38386,7 +41472,7 @@ "name": "smsp__issue_active.avg.pct_of_peak_sustained_active", "type": "Double", "unit": "", - "value": 80.99487586390642 + "value": 94.9842482237091 }, { "name": "smsp__thread_inst_executed_per_inst_executed.ratio", @@ -38405,30 +41491,47 @@ "time" ], "times": { - "compilation": 18000.121, - "data": 64139.019, - "framework": 1941409.074, - "kernel_overhead": 893566.658, - "profiling_overhead": 54021.405, - "profiling_runs": 929681.992, + "compilation_time": 15693.446, + "data": 58947.638, + "framework": 3575360.8310000002, + "kernel_overhead": 1712752.872, + "profiling_overhead": 48386.604, + "profiling_runs": 1755273.717, "runtimes": [ - 4848.928 + 6265.056 ], - "search_algorithm": 28.715, - "validation": 17.248 + "search_algorithm": 26.449, + "validation": 17.386 }, - "timestamp": "2026-01-27 09:26:44 UTC" + "timestamp": "2026-03-13 09:41:36 UTC" }, { + "compilation_data": { + "constant_memory_size": 0, + "global_size": { + "x": 8, + "y": 64, + "z": 256 + }, + "local_memory_size": 0, + "local_size": { + "x": 32, + "y": 4, + "z": 1 + }, + "max_work_group_size": 1024, + "private_memory_size": 0, + "registers": 38 + }, "configuration": { - "INNER_UNROLL_FACTOR": "2", - "USE_CONSTANT_MEMORY": "0", - "USE_SOA": "1", - "VECTOR_SIZE": "1", - "WORK_GROUP_SIZE_X": "32", - "WORK_GROUP_SIZE_Y": "2", - "WORK_GROUP_SIZE_Z": "1", - "Z_ITERATIONS": "8" + "INNER_UNROLL_FACTOR": 0, + "USE_CONSTANT_MEMORY": 0, + "USE_SOA": 1, + "VECTOR_SIZE": 1, + "WORK_GROUP_SIZE_X": 32, + "WORK_GROUP_SIZE_Y": 4, + "WORK_GROUP_SIZE_Z": 1, + "Z_ITERATIONS": 1 }, "correctness": 1, "invalidity": "correct", @@ -38436,61 +41539,61 @@ { "name": "time", "unit": "", - "value": 4846.016 + "value": 6994.592 }, { "name": "dram__throughput.avg.pct_of_peak_sustained_elapsed", "type": "Double", "unit": "", - "value": 21.471831845445667 + "value": 6.700676181312808 }, { "name": "dram__sectors_read.sum", "type": "Double", "unit": "", - "value": 2102104.0 + "value": 5452.0 }, { "name": "dram__sectors_write.sum", "type": "Double", "unit": "", - "value": 1838544.0 + "value": 1839792.0 }, { "name": "lts__t_sectors.avg.pct_of_peak_sustained_elapsed", "type": "Double", "unit": "", - "value": 2.5777456164700485 + "value": 0.9210978102341923 }, { "name": "lts__t_sectors_op_read.sum", "type": "Double", "unit": "", - "value": 2180920.0 + "value": 125716.0 }, { "name": "lts__t_sectors_op_write.sum", "type": "Double", "unit": "", - "value": 2100904.0 + "value": 2107871.0 }, { "name": "l1tex__t_requests_pipe_lsu_mem_global_op_ld.avg.pct_of_peak_sustained_active", "type": "Double", "unit": "", - "value": 9.005602455223926 + "value": 48.01319231114698 }, { "name": "l1tex__t_requests_pipe_lsu_mem_global_op_ld.sum", "type": "Double", "unit": "", - "value": 17301504.0 + "value": 134217728.0 }, { "name": "l1tex__data_pipe_lsu_wavefronts_mem_shared.avg.pct_of_peak_sustained_elapsed", "type": "Double", "unit": "", - "value": 0.10231273576245593 + "value": 0.5624822550649666 }, { "name": "l1tex__data_pipe_lsu_wavefronts_mem_shared_op_ld.sum", @@ -38520,13 +41623,13 @@ "name": "sm__warps_active.avg.pct_of_peak_sustained_active", "type": "Double", "unit": "", - "value": 99.03634631257495 + "value": 81.10450881377695 }, { "name": "smsp__cycles_active.avg.pct_of_peak_sustained_elapsed", "type": "Double", "unit": "", - "value": 99.97531870733587 + "value": 99.93720279558327 }, { "name": "smsp__sass_thread_inst_executed_op_fp16_pred_on.sum", @@ -38538,7 +41641,7 @@ "name": "smsp__sass_thread_inst_executed_op_fp32_pred_on.sum", "type": "Double", "unit": "", - "value": 4982833152.0 + "value": 8589934592.0 }, { "name": "smsp__sass_thread_inst_executed_op_fp64_pred_on.sum", @@ -38550,43 +41653,43 @@ "name": "smsp__sass_thread_inst_executed_op_integer_pred_on.sum", "type": "Double", "unit": "", - "value": 5460983808.0 + "value": 2952790016.0 }, { "name": "smsp__sass_thread_inst_executed_op_control_pred_on.sum", "type": "Double", "unit": "", - "value": 536870912.0 + "value": 285212672.0 }, { "name": "smsp__sass_thread_inst_executed_op_memory_pred_on.sum", "type": "Double", "unit": "", - "value": 570425344.0 + "value": 4311744512.0 }, { "name": "smsp__sass_thread_inst_executed_op_misc_pred_on.sum", "type": "Double", "unit": "", - "value": 1774190592.0 + "value": 318767104.0 }, { "name": "smsp__sass_thread_inst_executed_op_conversion_pred_on.sum", "type": "Double", "unit": "", - "value": 6291456.0 + "value": 50331648.0 }, { "name": "smsp__inst_executed.sum", "type": "Double", "unit": "", - "value": 622460928.0 + "value": 527958016.0 }, { "name": "smsp__pipe_fma_cycles_active.avg.pct_of_peak_sustained_active", "type": "Double", "unit": "", - "value": 31.98946285313594 + "value": 21.386473655179945 }, { "name": "smsp__inst_executed_pipe_fp64.avg.pct_of_peak_sustained_active", @@ -38598,13 +41701,13 @@ "name": "smsp__inst_executed_pipe_xu.avg.pct_of_peak_sustained_active", "type": "Double", "unit": "", - "value": 34.931368653580606 + "value": 24.01432320640513 }, { "name": "smsp__inst_executed_pipe_lsu.avg.pct_of_peak_sustained_active", "type": "Double", "unit": "", - "value": 4.690491396354817 + "value": 24.38954700650521 }, { "name": "smsp__inst_executed_pipe_tex.avg.pct_of_peak_sustained_active", @@ -38616,7 +41719,7 @@ "name": "smsp__issue_active.avg.pct_of_peak_sustained_active", "type": "Double", "unit": "", - "value": 81.00098325060928 + "value": 47.2317023453156 }, { "name": "smsp__thread_inst_executed_per_inst_executed.ratio", @@ -38635,30 +41738,47 @@ "time" ], "times": { - "compilation": 19116.588, - "data": 61731.469, - "framework": 1935931.361, - "kernel_overhead": 893564.107, - "profiling_overhead": 51534.454, - "profiling_runs": 929101.331, + "compilation_time": 14035.796, + "data": 58984.757, + "framework": 501385.99299999996, + "kernel_overhead": 172882.946, + "profiling_overhead": 48854.266, + "profiling_runs": 220664.024, "runtimes": [ - 4846.016 + 6994.592 ], - "search_algorithm": 24.558, - "validation": 17.093 + "search_algorithm": 23.968, + "validation": 13.525 }, - "timestamp": "2026-01-27 09:26:45 UTC" + "timestamp": "2026-03-13 09:41:36 UTC" }, { + "compilation_data": { + "constant_memory_size": 0, + "global_size": { + "x": 8, + "y": 64, + "z": 128 + }, + "local_memory_size": 0, + "local_size": { + "x": 32, + "y": 4, + "z": 1 + }, + "max_work_group_size": 1024, + "private_memory_size": 0, + "registers": 39 + }, "configuration": { - "INNER_UNROLL_FACTOR": "2", - "USE_CONSTANT_MEMORY": "0", - "USE_SOA": "1", - "VECTOR_SIZE": "1", - "WORK_GROUP_SIZE_X": "32", - "WORK_GROUP_SIZE_Y": "4", - "WORK_GROUP_SIZE_Z": "1", - "Z_ITERATIONS": "8" + "INNER_UNROLL_FACTOR": 0, + "USE_CONSTANT_MEMORY": 0, + "USE_SOA": 1, + "VECTOR_SIZE": 1, + "WORK_GROUP_SIZE_X": 32, + "WORK_GROUP_SIZE_Y": 4, + "WORK_GROUP_SIZE_Z": 1, + "Z_ITERATIONS": 2 }, "correctness": 1, "invalidity": "correct", @@ -38666,61 +41786,61 @@ { "name": "time", "unit": "", - "value": 4881.568 + "value": 3755.936 }, { "name": "dram__throughput.avg.pct_of_peak_sustained_elapsed", "type": "Double", "unit": "", - "value": 21.379863947933256 + "value": 13.033427242542903 }, { "name": "dram__sectors_read.sum", "type": "Double", "unit": "", - "value": 2110472.0 + "value": 6904.0 }, { "name": "dram__sectors_write.sum", "type": "Double", "unit": "", - "value": 1842032.0 + "value": 1842116.0 }, { "name": "lts__t_sectors.avg.pct_of_peak_sustained_elapsed", "type": "Double", "unit": "", - "value": 2.5665485530134613 + "value": 1.6922021141170684 }, { "name": "lts__t_sectors_op_read.sum", "type": "Double", "unit": "", - "value": 2191152.0 + "value": 71211.0 }, { "name": "lts__t_sectors_op_write.sum", "type": "Double", "unit": "", - "value": 2109009.0 + "value": 2101554.0 }, { "name": "l1tex__t_requests_pipe_lsu_mem_global_op_ld.avg.pct_of_peak_sustained_active", "type": "Double", "unit": "", - "value": 9.005753582900875 + "value": 45.73463816262577 }, { "name": "l1tex__t_requests_pipe_lsu_mem_global_op_ld.sum", "type": "Double", "unit": "", - "value": 17301504.0 + "value": 67108864.0 }, { "name": "l1tex__data_pipe_lsu_wavefronts_mem_shared.avg.pct_of_peak_sustained_elapsed", "type": "Double", "unit": "", - "value": 0.10231155592288615 + "value": 0.5357322698908262 }, { "name": "l1tex__data_pipe_lsu_wavefronts_mem_shared_op_ld.sum", @@ -38750,13 +41870,13 @@ "name": "sm__warps_active.avg.pct_of_peak_sustained_active", "type": "Double", "unit": "", - "value": 98.9856794037345 + "value": 92.78081628367683 }, { "name": "smsp__cycles_active.avg.pct_of_peak_sustained_elapsed", "type": "Double", "unit": "", - "value": 99.97025618104216 + "value": 99.94397690581044 }, { "name": "smsp__sass_thread_inst_executed_op_fp16_pred_on.sum", @@ -38768,7 +41888,7 @@ "name": "smsp__sass_thread_inst_executed_op_fp32_pred_on.sum", "type": "Double", "unit": "", - "value": 4982833152.0 + "value": 6442450944.0 }, { "name": "smsp__sass_thread_inst_executed_op_fp64_pred_on.sum", @@ -38780,43 +41900,43 @@ "name": "smsp__sass_thread_inst_executed_op_integer_pred_on.sum", "type": "Double", "unit": "", - "value": 5460983808.0 + "value": 1509949440.0 }, { "name": "smsp__sass_thread_inst_executed_op_control_pred_on.sum", "type": "Double", "unit": "", - "value": 536870912.0 + "value": 142606336.0 }, { "name": "smsp__sass_thread_inst_executed_op_memory_pred_on.sum", "type": "Double", "unit": "", - "value": 570425344.0 + "value": 2164260864.0 }, { "name": "smsp__sass_thread_inst_executed_op_misc_pred_on.sum", "type": "Double", "unit": "", - "value": 1774190592.0 + "value": 184549376.0 }, { "name": "smsp__sass_thread_inst_executed_op_conversion_pred_on.sum", "type": "Double", "unit": "", - "value": 6291456.0 + "value": 25165824.0 }, { "name": "smsp__inst_executed.sum", "type": "Double", "unit": "", - "value": 622460928.0 + "value": 333971456.0 }, { "name": "smsp__pipe_fma_cycles_active.avg.pct_of_peak_sustained_active", "type": "Double", "unit": "", - "value": 31.99043280774745 + "value": 28.990678582704803 }, { "name": "smsp__inst_executed_pipe_fp64.avg.pct_of_peak_sustained_active", @@ -38828,13 +41948,13 @@ "name": "smsp__inst_executed_pipe_xu.avg.pct_of_peak_sustained_active", "type": "Double", "unit": "", - "value": 34.932734751059854 + "value": 45.74144613747044 }, { "name": "smsp__inst_executed_pipe_lsu.avg.pct_of_peak_sustained_active", "type": "Double", "unit": "", - "value": 4.690674832295635 + "value": 23.31741687867145 }, { "name": "smsp__inst_executed_pipe_tex.avg.pct_of_peak_sustained_active", @@ -38846,7 +41966,7 @@ "name": "smsp__issue_active.avg.pct_of_peak_sustained_active", "type": "Double", "unit": "", - "value": 81.0041749853352 + "value": 56.909392898460645 }, { "name": "smsp__thread_inst_executed_per_inst_executed.ratio", @@ -38865,30 +41985,47 @@ "time" ], "times": { - "compilation": 16520.803, - "data": 60652.393, - "framework": 1929983.3020000001, - "kernel_overhead": 891601.214, - "profiling_overhead": 50470.356, - "profiling_runs": 927259.339, + "compilation_time": 14700.399, + "data": 59555.341, + "framework": 320751.338, + "kernel_overhead": 90735.66, + "profiling_overhead": 49288.223, + "profiling_runs": 121172.114, "runtimes": [ - 4881.568 + 3755.936 ], - "search_algorithm": 31.139, - "validation": 17.277 + "search_algorithm": 38.673, + "validation": 17.642 }, - "timestamp": "2026-01-27 09:26:46 UTC" + "timestamp": "2026-03-13 09:41:36 UTC" }, { + "compilation_data": { + "constant_memory_size": 0, + "global_size": { + "x": 8, + "y": 64, + "z": 64 + }, + "local_memory_size": 0, + "local_size": { + "x": 32, + "y": 4, + "z": 1 + }, + "max_work_group_size": 1024, + "private_memory_size": 0, + "registers": 40 + }, "configuration": { - "INNER_UNROLL_FACTOR": "2", - "USE_CONSTANT_MEMORY": "0", - "USE_SOA": "1", - "VECTOR_SIZE": "1", - "WORK_GROUP_SIZE_X": "32", - "WORK_GROUP_SIZE_Y": "8", - "WORK_GROUP_SIZE_Z": "1", - "Z_ITERATIONS": "8" + "INNER_UNROLL_FACTOR": 0, + "USE_CONSTANT_MEMORY": 0, + "USE_SOA": 1, + "VECTOR_SIZE": 1, + "WORK_GROUP_SIZE_X": 32, + "WORK_GROUP_SIZE_Y": 4, + "WORK_GROUP_SIZE_Z": 1, + "Z_ITERATIONS": 4 }, "correctness": 1, "invalidity": "correct", @@ -38896,61 +42033,61 @@ { "name": "time", "unit": "", - "value": 4847.808 + "value": 2254.336 }, { "name": "dram__throughput.avg.pct_of_peak_sustained_elapsed", "type": "Double", "unit": "", - "value": 20.989200008162488 + "value": 21.250275492212754 }, { "name": "dram__sectors_read.sum", "type": "Double", "unit": "", - "value": 2107172.0 + "value": 10188.0 }, { "name": "dram__sectors_write.sum", "type": "Double", "unit": "", - "value": 1842532.0 + "value": 1841068.0 }, { "name": "lts__t_sectors.avg.pct_of_peak_sustained_elapsed", "type": "Double", "unit": "", - "value": 2.571327401354967 + "value": 2.8040939346759925 }, { "name": "lts__t_sectors_op_read.sum", "type": "Double", "unit": "", - "value": 2188377.0 + "value": 49335.0 }, { "name": "lts__t_sectors_op_write.sum", "type": "Double", "unit": "", - "value": 2102707.0 + "value": 2101558.0 }, { "name": "l1tex__t_requests_pipe_lsu_mem_global_op_ld.avg.pct_of_peak_sustained_active", "type": "Double", "unit": "", - "value": 8.994728403610889 + "value": 37.924088737834445 }, { "name": "l1tex__t_requests_pipe_lsu_mem_global_op_ld.sum", "type": "Double", "unit": "", - "value": 17301504.0 + "value": 33554432.0 }, { "name": "l1tex__data_pipe_lsu_wavefronts_mem_shared.avg.pct_of_peak_sustained_elapsed", "type": "Double", "unit": "", - "value": 0.10218675327703637 + "value": 0.44413902657542087 }, { "name": "l1tex__data_pipe_lsu_wavefronts_mem_shared_op_ld.sum", @@ -38980,13 +42117,13 @@ "name": "sm__warps_active.avg.pct_of_peak_sustained_active", "type": "Double", "unit": "", - "value": 98.6465654128516 + "value": 94.3137804701467 }, { "name": "smsp__cycles_active.avg.pct_of_peak_sustained_elapsed", "type": "Double", "unit": "", - "value": 99.95930380638171 + "value": 99.90154791739802 }, { "name": "smsp__sass_thread_inst_executed_op_fp16_pred_on.sum", @@ -38998,7 +42135,7 @@ "name": "smsp__sass_thread_inst_executed_op_fp32_pred_on.sum", "type": "Double", "unit": "", - "value": 4982833152.0 + "value": 5368709120.0 }, { "name": "smsp__sass_thread_inst_executed_op_fp64_pred_on.sum", @@ -39010,43 +42147,43 @@ "name": "smsp__sass_thread_inst_executed_op_integer_pred_on.sum", "type": "Double", "unit": "", - "value": 5460983808.0 + "value": 591396864.0 }, { "name": "smsp__sass_thread_inst_executed_op_control_pred_on.sum", "type": "Double", "unit": "", - "value": 536870912.0 + "value": 71303168.0 }, { "name": "smsp__sass_thread_inst_executed_op_memory_pred_on.sum", "type": "Double", "unit": "", - "value": 570425344.0 + "value": 1090519040.0 }, { "name": "smsp__sass_thread_inst_executed_op_misc_pred_on.sum", "type": "Double", "unit": "", - "value": 1774190592.0 + "value": 146800640.0 }, { "name": "smsp__sass_thread_inst_executed_op_conversion_pred_on.sum", "type": "Double", "unit": "", - "value": 6291456.0 + "value": 12582912.0 }, { "name": "smsp__inst_executed.sum", "type": "Double", "unit": "", - "value": 622460928.0 + "value": 228327424.0 }, { "name": "smsp__pipe_fma_cycles_active.avg.pct_of_peak_sustained_active", "type": "Double", "unit": "", - "value": 31.95483904499311 + "value": 43.23514044683524 }, { "name": "smsp__inst_executed_pipe_fp64.avg.pct_of_peak_sustained_active", @@ -39058,13 +42195,13 @@ "name": "smsp__inst_executed_pipe_xu.avg.pct_of_peak_sustained_active", "type": "Double", "unit": "", - "value": 34.89394562623486 + "value": 75.87442715590245 }, { "name": "smsp__inst_executed_pipe_lsu.avg.pct_of_peak_sustained_active", "type": "Double", "unit": "", - "value": 4.685466331647747 + "value": 19.487279630861668 }, { "name": "smsp__inst_executed_pipe_tex.avg.pct_of_peak_sustained_active", @@ -39076,7 +42213,7 @@ "name": "smsp__issue_active.avg.pct_of_peak_sustained_active", "type": "Double", "unit": "", - "value": 80.9142327953996 + "value": 64.5387993633631 }, { "name": "smsp__thread_inst_executed_per_inst_executed.ratio", @@ -39095,30 +42232,47 @@ "time" ], "times": { - "compilation": 15578.485, - "data": 63118.272, - "framework": 1931121.685, - "kernel_overhead": 889656.174, - "profiling_overhead": 52829.147, - "profiling_runs": 925518.092, + "compilation_time": 14639.351, + "data": 56748.026, + "framework": 190240.524, + "kernel_overhead": 31641.076, + "profiling_overhead": 47057.379, + "profiling_runs": 54794.043, "runtimes": [ - 4847.808 + 2254.336 ], - "search_algorithm": 23.399, - "validation": 13.103 + "search_algorithm": 22.247, + "validation": 13.792 }, - "timestamp": "2026-01-27 09:26:47 UTC" + "timestamp": "2026-03-13 09:41:37 UTC" }, { + "compilation_data": { + "constant_memory_size": 0, + "global_size": { + "x": 8, + "y": 64, + "z": 32 + }, + "local_memory_size": 0, + "local_size": { + "x": 32, + "y": 4, + "z": 1 + }, + "max_work_group_size": 1024, + "private_memory_size": 0, + "registers": 40 + }, "configuration": { - "INNER_UNROLL_FACTOR": "2", - "USE_CONSTANT_MEMORY": "0", - "USE_SOA": "1", - "VECTOR_SIZE": "1", - "WORK_GROUP_SIZE_X": "16", - "WORK_GROUP_SIZE_Y": "4", - "WORK_GROUP_SIZE_Z": "1", - "Z_ITERATIONS": "16" + "INNER_UNROLL_FACTOR": 0, + "USE_CONSTANT_MEMORY": 0, + "USE_SOA": 1, + "VECTOR_SIZE": 1, + "WORK_GROUP_SIZE_X": 32, + "WORK_GROUP_SIZE_Y": 4, + "WORK_GROUP_SIZE_Z": 1, + "Z_ITERATIONS": 8 }, "correctness": 1, "invalidity": "correct", @@ -39126,61 +42280,61 @@ { "name": "time", "unit": "", - "value": 8062.815 + "value": 2056.864 }, { "name": "dram__throughput.avg.pct_of_peak_sustained_elapsed", "type": "Double", "unit": "", - "value": 13.662295303517746 + "value": 24.216573085574243 }, { "name": "dram__sectors_read.sum", "type": "Double", "unit": "", - "value": 2109808.0 + "value": 7608.0 }, { "name": "dram__sectors_write.sum", "type": "Double", "unit": "", - "value": 1869212.0 + "value": 1839020.0 }, { "name": "lts__t_sectors.avg.pct_of_peak_sustained_elapsed", "type": "Double", "unit": "", - "value": 1.6419167084044486 + "value": 3.1692640805356103 }, { "name": "lts__t_sectors_op_read.sum", "type": "Double", "unit": "", - "value": 2225073.0 + "value": 41330.0 }, { "name": "lts__t_sectors_op_write.sum", "type": "Double", "unit": "", - "value": 2102446.0 + "value": 2100668.0 }, { "name": "l1tex__t_requests_pipe_lsu_mem_global_op_ld.avg.pct_of_peak_sustained_active", "type": "Double", "unit": "", - "value": 2.923171201613733 + "value": 21.53412098315904 }, { "name": "l1tex__t_requests_pipe_lsu_mem_global_op_ld.sum", "type": "Double", "unit": "", - "value": 8912896.0 + "value": 16777216.0 }, { "name": "l1tex__data_pipe_lsu_wavefronts_mem_shared.avg.pct_of_peak_sustained_elapsed", "type": "Double", "unit": "", - "value": 0.03223627595249105 + "value": 0.2521737363762462 }, { "name": "l1tex__data_pipe_lsu_wavefronts_mem_shared_op_ld.sum", @@ -39210,13 +42364,13 @@ "name": "sm__warps_active.avg.pct_of_peak_sustained_active", "type": "Double", "unit": "", - "value": 98.83085856847794 + "value": 97.82925658236906 }, { "name": "smsp__cycles_active.avg.pct_of_peak_sustained_elapsed", "type": "Double", "unit": "", - "value": 99.98235545612481 + "value": 99.91173637050002 }, { "name": "smsp__sass_thread_inst_executed_op_fp16_pred_on.sum", @@ -39228,7 +42382,7 @@ "name": "smsp__sass_thread_inst_executed_op_fp32_pred_on.sum", "type": "Double", "unit": "", - "value": 4647288832.0 + "value": 4831838208.0 }, { "name": "smsp__sass_thread_inst_executed_op_fp64_pred_on.sum", @@ -39240,43 +42394,43 @@ "name": "smsp__sass_thread_inst_executed_op_integer_pred_on.sum", "type": "Double", "unit": "", - "value": 8467251200.0 + "value": 564133888.0 }, { "name": "smsp__sass_thread_inst_executed_op_control_pred_on.sum", "type": "Double", "unit": "", - "value": 536870912.0 + "value": 69206016.0 }, { "name": "smsp__sass_thread_inst_executed_op_memory_pred_on.sum", "type": "Double", "unit": "", - "value": 301989888.0 + "value": 553648128.0 }, { "name": "smsp__sass_thread_inst_executed_op_misc_pred_on.sum", "type": "Double", "unit": "", - "value": 1429209088.0 + "value": 115343360.0 }, { "name": "smsp__sass_thread_inst_executed_op_conversion_pred_on.sum", "type": "Double", "unit": "", - "value": 3145728.0 + "value": 6291456.0 }, { "name": "smsp__inst_executed.sum", "type": "Double", "unit": "", - "value": 971702272.0 + "value": 192282624.0 }, { "name": "smsp__pipe_fma_cycles_active.avg.pct_of_peak_sustained_active", "type": "Double", "unit": "", - "value": 27.531915936216695 + "value": 43.62179114355628 }, { "name": "smsp__inst_executed_pipe_fp64.avg.pct_of_peak_sustained_active", @@ -39288,13 +42442,13 @@ "name": "smsp__inst_executed_pipe_xu.avg.pct_of_peak_sustained_active", "type": "Double", "unit": "", - "value": 22.010514705158197 + "value": 86.15134231801818 }, { "name": "smsp__inst_executed_pipe_lsu.avg.pct_of_peak_sustained_active", "type": "Double", "unit": "", - "value": 1.563735297656503 + "value": 11.23164472603069 }, { "name": "smsp__inst_executed_pipe_tex.avg.pct_of_peak_sustained_active", @@ -39306,7 +42460,7 @@ "name": "smsp__issue_active.avg.pct_of_peak_sustained_active", "type": "Double", "unit": "", - "value": 79.67558084014644 + "value": 61.71213048764038 }, { "name": "smsp__thread_inst_executed_per_inst_executed.ratio", @@ -39325,30 +42479,47 @@ "time" ], "times": { - "compilation": 15154.891, - "data": 62996.806, - "framework": 3110732.614, - "kernel_overhead": 1472748.273, - "profiling_overhead": 52629.734, - "profiling_runs": 1522357.801, + "compilation_time": 14393.619, + "data": 57445.284, + "framework": 185436.48799999998, + "kernel_overhead": 29609.286, + "profiling_overhead": 47059.049, + "profiling_runs": 51322.869, "runtimes": [ - 8062.815 + 2056.864 ], - "search_algorithm": 36.095, - "validation": 14.294 + "search_algorithm": 24.077, + "validation": 15.346 }, - "timestamp": "2026-01-27 09:26:49 UTC" + "timestamp": "2026-03-13 09:41:37 UTC" }, { + "compilation_data": { + "constant_memory_size": 0, + "global_size": { + "x": 8, + "y": 64, + "z": 16 + }, + "local_memory_size": 0, + "local_size": { + "x": 32, + "y": 4, + "z": 1 + }, + "max_work_group_size": 1024, + "private_memory_size": 0, + "registers": 40 + }, "configuration": { - "INNER_UNROLL_FACTOR": "2", - "USE_CONSTANT_MEMORY": "0", - "USE_SOA": "1", - "VECTOR_SIZE": "1", - "WORK_GROUP_SIZE_X": "16", - "WORK_GROUP_SIZE_Y": "8", - "WORK_GROUP_SIZE_Z": "1", - "Z_ITERATIONS": "16" + "INNER_UNROLL_FACTOR": 0, + "USE_CONSTANT_MEMORY": 0, + "USE_SOA": 1, + "VECTOR_SIZE": 1, + "WORK_GROUP_SIZE_X": 32, + "WORK_GROUP_SIZE_Y": 4, + "WORK_GROUP_SIZE_Z": 1, + "Z_ITERATIONS": 16 }, "correctness": 1, "invalidity": "correct", @@ -39356,61 +42527,61 @@ { "name": "time", "unit": "", - "value": 7664.544 + "value": 1860.32 }, { "name": "dram__throughput.avg.pct_of_peak_sustained_elapsed", "type": "Double", "unit": "", - "value": 13.731781644731031 + "value": 25.678676260661646 }, { "name": "dram__sectors_read.sum", "type": "Double", "unit": "", - "value": 2107708.0 + "value": 3648.0 }, { "name": "dram__sectors_write.sum", "type": "Double", "unit": "", - "value": 1872004.0 + "value": 1838380.0 }, { "name": "lts__t_sectors.avg.pct_of_peak_sustained_elapsed", "type": "Double", "unit": "", - "value": 1.6437561464188177 + "value": 3.3680625932498884 }, { "name": "lts__t_sectors_op_read.sum", "type": "Double", "unit": "", - "value": 2220904.0 + "value": 34321.0 }, { "name": "lts__t_sectors_op_write.sum", "type": "Double", "unit": "", - "value": 2108929.0 + "value": 2100374.0 }, { "name": "l1tex__t_requests_pipe_lsu_mem_global_op_ld.avg.pct_of_peak_sustained_active", "type": "Double", "unit": "", - "value": 2.92311048743405 + "value": 11.450392130443438 }, { "name": "l1tex__t_requests_pipe_lsu_mem_global_op_ld.sum", "type": "Double", "unit": "", - "value": 8912896.0 + "value": 8388608.0 }, { "name": "l1tex__data_pipe_lsu_wavefronts_mem_shared.avg.pct_of_peak_sustained_elapsed", "type": "Double", "unit": "", - "value": 0.032235932608117544 + "value": 0.1340591086334808 }, { "name": "l1tex__data_pipe_lsu_wavefronts_mem_shared_op_ld.sum", @@ -39440,13 +42611,13 @@ "name": "sm__warps_active.avg.pct_of_peak_sustained_active", "type": "Double", "unit": "", - "value": 98.78274125831581 + "value": 98.12224873410426 }, { "name": "smsp__cycles_active.avg.pct_of_peak_sustained_elapsed", "type": "Double", "unit": "", - "value": 99.98292147150526 + "value": 99.87836335571507 }, { "name": "smsp__sass_thread_inst_executed_op_fp16_pred_on.sum", @@ -39458,7 +42629,7 @@ "name": "smsp__sass_thread_inst_executed_op_fp32_pred_on.sum", "type": "Double", "unit": "", - "value": 4647288832.0 + "value": 4563402752.0 }, { "name": "smsp__sass_thread_inst_executed_op_fp64_pred_on.sum", @@ -39470,25 +42641,25 @@ "name": "smsp__sass_thread_inst_executed_op_integer_pred_on.sum", "type": "Double", "unit": "", - "value": 8467251200.0 + "value": 209715200.0 }, { "name": "smsp__sass_thread_inst_executed_op_control_pred_on.sum", "type": "Double", "unit": "", - "value": 536870912.0 + "value": 67108864.0 }, { "name": "smsp__sass_thread_inst_executed_op_memory_pred_on.sum", "type": "Double", "unit": "", - "value": 301989888.0 + "value": 285212672.0 }, { "name": "smsp__sass_thread_inst_executed_op_misc_pred_on.sum", "type": "Double", "unit": "", - "value": 1429209088.0 + "value": 558891008.0 }, { "name": "smsp__sass_thread_inst_executed_op_conversion_pred_on.sum", @@ -39500,13 +42671,13 @@ "name": "smsp__inst_executed.sum", "type": "Double", "unit": "", - "value": 971702272.0 + "value": 194805760.0 }, { "name": "smsp__pipe_fma_cycles_active.avg.pct_of_peak_sustained_active", "type": "Double", "unit": "", - "value": 27.53152234761756 + "value": 37.78229862113663 }, { "name": "smsp__inst_executed_pipe_fp64.avg.pct_of_peak_sustained_active", @@ -39518,13 +42689,13 @@ "name": "smsp__inst_executed_pipe_xu.avg.pct_of_peak_sustained_active", "type": "Double", "unit": "", - "value": 22.010155671182947 + "value": 91.62913943752184 }, { "name": "smsp__inst_executed_pipe_lsu.avg.pct_of_peak_sustained_active", "type": "Double", "unit": "", - "value": 1.5637097901157808 + "value": 6.1518587268844005 }, { "name": "smsp__inst_executed_pipe_tex.avg.pct_of_peak_sustained_active", @@ -39536,7 +42707,7 @@ "name": "smsp__issue_active.avg.pct_of_peak_sustained_active", "type": "Double", "unit": "", - "value": 79.6742949526775 + "value": 66.49779298347511 }, { "name": "smsp__thread_inst_executed_per_inst_executed.ratio", @@ -39555,30 +42726,47 @@ "time" ], "times": { - "compilation": 14502.069, - "data": 64505.05, - "framework": 3119183.376, - "kernel_overhead": 1475701.685, - "profiling_overhead": 54208.014, - "profiling_runs": 1524768.627, + "compilation_time": 14962.749, + "data": 56615.263, + "framework": 191744.306, + "kernel_overhead": 33397.105, + "profiling_overhead": 47161.535, + "profiling_runs": 54570.403, "runtimes": [ - 7664.544 + 1860.32 ], - "search_algorithm": 23.878, - "validation": 13.762 + "search_algorithm": 26.654, + "validation": 15.135 }, - "timestamp": "2026-01-27 09:26:51 UTC" + "timestamp": "2026-03-13 09:41:37 UTC" }, { + "compilation_data": { + "constant_memory_size": 0, + "global_size": { + "x": 8, + "y": 64, + "z": 8 + }, + "local_memory_size": 0, + "local_size": { + "x": 32, + "y": 4, + "z": 1 + }, + "max_work_group_size": 1024, + "private_memory_size": 0, + "registers": 48 + }, "configuration": { - "INNER_UNROLL_FACTOR": "2", - "USE_CONSTANT_MEMORY": "0", - "USE_SOA": "1", - "VECTOR_SIZE": "1", - "WORK_GROUP_SIZE_X": "32", - "WORK_GROUP_SIZE_Y": "2", - "WORK_GROUP_SIZE_Z": "1", - "Z_ITERATIONS": "16" + "INNER_UNROLL_FACTOR": 0, + "USE_CONSTANT_MEMORY": 0, + "USE_SOA": 1, + "VECTOR_SIZE": 1, + "WORK_GROUP_SIZE_X": 32, + "WORK_GROUP_SIZE_Y": 4, + "WORK_GROUP_SIZE_Z": 1, + "Z_ITERATIONS": 32 }, "correctness": 1, "invalidity": "correct", @@ -39586,61 +42774,61 @@ { "name": "time", "unit": "", - "value": 7634.368 + "value": 1889.088 }, { "name": "dram__throughput.avg.pct_of_peak_sustained_elapsed", "type": "Double", "unit": "", - "value": 13.614816096172042 + "value": 26.519925225524993 }, { "name": "dram__sectors_read.sum", "type": "Double", "unit": "", - "value": 2102600.0 + "value": 668.0 }, { "name": "dram__sectors_write.sum", "type": "Double", "unit": "", - "value": 1838996.0 + "value": 1835648.0 }, { "name": "lts__t_sectors.avg.pct_of_peak_sustained_elapsed", "type": "Double", "unit": "", - "value": 1.6393580217228094 + "value": 3.4726713679011216 }, { "name": "lts__t_sectors_op_read.sum", "type": "Double", "unit": "", - "value": 2225503.0 + "value": 30048.0 }, { "name": "lts__t_sectors_op_write.sum", "type": "Double", "unit": "", - "value": 2100524.0 + "value": 2099333.0 }, { "name": "l1tex__t_requests_pipe_lsu_mem_global_op_ld.avg.pct_of_peak_sustained_active", "type": "Double", "unit": "", - "value": 2.923515814105758 + "value": 5.929496832988967 }, { "name": "l1tex__t_requests_pipe_lsu_mem_global_op_ld.sum", "type": "Double", "unit": "", - "value": 8912896.0 + "value": 4194304.0 }, { "name": "l1tex__data_pipe_lsu_wavefronts_mem_shared.avg.pct_of_peak_sustained_elapsed", "type": "Double", "unit": "", - "value": 0.032239502562946006 + "value": 0.06943588205438443 }, { "name": "l1tex__data_pipe_lsu_wavefronts_mem_shared_op_ld.sum", @@ -39670,13 +42858,13 @@ "name": "sm__warps_active.avg.pct_of_peak_sustained_active", "type": "Double", "unit": "", - "value": 98.84582073060159 + "value": 81.65697193869443 }, { "name": "smsp__cycles_active.avg.pct_of_peak_sustained_elapsed", "type": "Double", "unit": "", - "value": 99.98150231320005 + "value": 99.9145387141772 }, { "name": "smsp__sass_thread_inst_executed_op_fp16_pred_on.sum", @@ -39688,7 +42876,7 @@ "name": "smsp__sass_thread_inst_executed_op_fp32_pred_on.sum", "type": "Double", "unit": "", - "value": 4647288832.0 + "value": 4429185024.0 }, { "name": "smsp__sass_thread_inst_executed_op_fp64_pred_on.sum", @@ -39700,43 +42888,43 @@ "name": "smsp__sass_thread_inst_executed_op_integer_pred_on.sum", "type": "Double", "unit": "", - "value": 8467251200.0 + "value": 138412032.0 }, { "name": "smsp__sass_thread_inst_executed_op_control_pred_on.sum", "type": "Double", "unit": "", - "value": 536870912.0 + "value": 33554432.0 }, { "name": "smsp__sass_thread_inst_executed_op_memory_pred_on.sum", "type": "Double", "unit": "", - "value": 301989888.0 + "value": 150994944.0 }, { "name": "smsp__sass_thread_inst_executed_op_misc_pred_on.sum", "type": "Double", "unit": "", - "value": 1429209088.0 + "value": 283639808.0 }, { "name": "smsp__sass_thread_inst_executed_op_conversion_pred_on.sum", "type": "Double", "unit": "", - "value": 3145728.0 + "value": 1572864.0 }, { "name": "smsp__inst_executed.sum", "type": "Double", "unit": "", - "value": 971702272.0 + "value": 165953536.0 }, { "name": "smsp__pipe_fma_cycles_active.avg.pct_of_peak_sustained_active", "type": "Double", "unit": "", - "value": 27.53491031913527 + "value": 37.627757938144 }, { "name": "smsp__inst_executed_pipe_fp64.avg.pct_of_peak_sustained_active", @@ -39748,13 +42936,13 @@ "name": "smsp__inst_executed_pipe_xu.avg.pct_of_peak_sustained_active", "type": "Double", "unit": "", - "value": 22.012905627976437 + "value": 94.88421356721027 }, { "name": "smsp__inst_executed_pipe_lsu.avg.pct_of_peak_sustained_active", "type": "Double", "unit": "", - "value": 1.5639051605813339 + "value": 3.370520770026634 }, { "name": "smsp__inst_executed_pipe_tex.avg.pct_of_peak_sustained_active", @@ -39766,7 +42954,7 @@ "name": "smsp__issue_active.avg.pct_of_peak_sustained_active", "type": "Double", "unit": "", - "value": 79.68423620312188 + "value": 58.6611058005592 }, { "name": "smsp__thread_inst_executed_per_inst_executed.ratio", @@ -39785,30 +42973,47 @@ "time" ], "times": { - "compilation": 14506.503, - "data": 60028.933, - "framework": 3106114.784, - "kernel_overhead": 1473637.759, - "profiling_overhead": 49935.384, - "profiling_runs": 1522512.708, + "compilation_time": 14827.233, + "data": 57999.745, + "framework": 168008.556, + "kernel_overhead": 20752.562, + "profiling_overhead": 47588.496, + "profiling_runs": 41667.753, "runtimes": [ - 7634.368 + 1889.088 ], - "search_algorithm": 24.451, - "validation": 13.449 + "search_algorithm": 27.513, + "validation": 17.332 }, - "timestamp": "2026-01-27 09:26:52 UTC" + "timestamp": "2026-03-13 09:41:37 UTC" }, { + "compilation_data": { + "constant_memory_size": 0, + "global_size": { + "x": 8, + "y": 64, + "z": 128 + }, + "local_memory_size": 0, + "local_size": { + "x": 32, + "y": 4, + "z": 1 + }, + "max_work_group_size": 1024, + "private_memory_size": 0, + "registers": 22 + }, "configuration": { - "INNER_UNROLL_FACTOR": "2", - "USE_CONSTANT_MEMORY": "0", - "USE_SOA": "1", - "VECTOR_SIZE": "1", - "WORK_GROUP_SIZE_X": "32", - "WORK_GROUP_SIZE_Y": "4", - "WORK_GROUP_SIZE_Z": "1", - "Z_ITERATIONS": "16" + "INNER_UNROLL_FACTOR": 1, + "USE_CONSTANT_MEMORY": 0, + "USE_SOA": 1, + "VECTOR_SIZE": 1, + "WORK_GROUP_SIZE_X": 32, + "WORK_GROUP_SIZE_Y": 4, + "WORK_GROUP_SIZE_Z": 1, + "Z_ITERATIONS": 2 }, "correctness": 1, "invalidity": "correct", @@ -39816,61 +43021,61 @@ { "name": "time", "unit": "", - "value": 7635.968 + "value": 6320.832 }, { "name": "dram__throughput.avg.pct_of_peak_sustained_elapsed", "type": "Double", "unit": "", - "value": 13.639965956317976 + "value": 7.368079397501987 }, { "name": "dram__sectors_read.sum", "type": "Double", "unit": "", - "value": 2109940.0 + "value": 9072.0 }, { "name": "dram__sectors_write.sum", "type": "Double", "unit": "", - "value": 1838972.0 + "value": 1842412.0 }, { "name": "lts__t_sectors.avg.pct_of_peak_sustained_elapsed", "type": "Double", "unit": "", - "value": 1.6427176601487028 + "value": 1.0021556079354108 }, { "name": "lts__t_sectors_op_read.sum", "type": "Double", "unit": "", - "value": 2233008.0 + "value": 113546.0 }, { "name": "lts__t_sectors_op_write.sum", "type": "Double", "unit": "", - "value": 2101938.0 + "value": 2103907.0 }, { "name": "l1tex__t_requests_pipe_lsu_mem_global_op_ld.avg.pct_of_peak_sustained_active", "type": "Double", "unit": "", - "value": 2.9235324518332404 + "value": 26.284241336142838 }, { "name": "l1tex__t_requests_pipe_lsu_mem_global_op_ld.sum", "type": "Double", "unit": "", - "value": 8912896.0 + "value": 67108864.0 }, { "name": "l1tex__data_pipe_lsu_wavefronts_mem_shared.avg.pct_of_peak_sustained_elapsed", "type": "Double", "unit": "", - "value": 0.03223920651672742 + "value": 0.30795584875992044 }, { "name": "l1tex__data_pipe_lsu_wavefronts_mem_shared_op_ld.sum", @@ -39900,13 +43105,13 @@ "name": "sm__warps_active.avg.pct_of_peak_sustained_active", "type": "Double", "unit": "", - "value": 98.81359659857272 + "value": 97.56803115678323 }, { "name": "smsp__cycles_active.avg.pct_of_peak_sustained_elapsed", "type": "Double", "unit": "", - "value": 99.98345348240363 + "value": 99.97026853080243 }, { "name": "smsp__sass_thread_inst_executed_op_fp16_pred_on.sum", @@ -39918,7 +43123,7 @@ "name": "smsp__sass_thread_inst_executed_op_fp32_pred_on.sum", "type": "Double", "unit": "", - "value": 4647288832.0 + "value": 6979321856.0 }, { "name": "smsp__sass_thread_inst_executed_op_fp64_pred_on.sum", @@ -39930,43 +43135,43 @@ "name": "smsp__sass_thread_inst_executed_op_integer_pred_on.sum", "type": "Double", "unit": "", - "value": 8467251200.0 + "value": 10880024576.0 }, { "name": "smsp__sass_thread_inst_executed_op_control_pred_on.sum", "type": "Double", "unit": "", - "value": 536870912.0 + "value": 1073741824.0 }, { "name": "smsp__sass_thread_inst_executed_op_memory_pred_on.sum", "type": "Double", "unit": "", - "value": 301989888.0 + "value": 2164260864.0 }, { "name": "smsp__sass_thread_inst_executed_op_misc_pred_on.sum", "type": "Double", "unit": "", - "value": 1429209088.0 + "value": 2248146944.0 }, { "name": "smsp__sass_thread_inst_executed_op_conversion_pred_on.sum", "type": "Double", "unit": "", - "value": 3145728.0 + "value": 25165824.0 }, { "name": "smsp__inst_executed.sum", "type": "Double", "unit": "", - "value": 971702272.0 + "value": 816054272.0 }, { "name": "smsp__pipe_fma_cycles_active.avg.pct_of_peak_sustained_active", "type": "Double", "unit": "", - "value": 27.534246092428667 + "value": 43.024014447322585 }, { "name": "smsp__inst_executed_pipe_fp64.avg.pct_of_peak_sustained_active", @@ -39978,13 +43183,13 @@ "name": "smsp__inst_executed_pipe_xu.avg.pct_of_peak_sustained_active", "type": "Double", "unit": "", - "value": 22.01227391352905 + "value": 26.286714520610623 }, { "name": "smsp__inst_executed_pipe_lsu.avg.pct_of_peak_sustained_active", "type": "Double", "unit": "", - "value": 1.5638602804777717 + "value": 13.40006345679565 }, { "name": "smsp__inst_executed_pipe_tex.avg.pct_of_peak_sustained_active", @@ -39996,7 +43201,7 @@ "name": "smsp__issue_active.avg.pct_of_peak_sustained_active", "type": "Double", "unit": "", - "value": 79.6819627523962 + "value": 79.9129962001864 }, { "name": "smsp__thread_inst_executed_per_inst_executed.ratio", @@ -40015,30 +43220,47 @@ "time" ], "times": { - "compilation": 15969.029, - "data": 63997.246, - "framework": 3115864.824, - "kernel_overhead": 1474473.164, - "profiling_overhead": 53808.02, - "profiling_runs": 1523586.394, + "compilation_time": 15235.46, + "data": 58978.966, + "framework": 2155806.7199999997, + "kernel_overhead": 1001857.754, + "profiling_overhead": 48435.123, + "profiling_runs": 1046534.877, "runtimes": [ - 7635.968 + 6320.832 ], - "search_algorithm": 39.189, - "validation": 19.344 + "search_algorithm": 26.109, + "validation": 15.522 }, - "timestamp": "2026-01-27 09:26:54 UTC" + "timestamp": "2026-03-13 09:41:38 UTC" }, { + "compilation_data": { + "constant_memory_size": 0, + "global_size": { + "x": 8, + "y": 64, + "z": 64 + }, + "local_memory_size": 0, + "local_size": { + "x": 32, + "y": 4, + "z": 1 + }, + "max_work_group_size": 1024, + "private_memory_size": 0, + "registers": 25 + }, "configuration": { - "INNER_UNROLL_FACTOR": "2", - "USE_CONSTANT_MEMORY": "0", - "USE_SOA": "1", - "VECTOR_SIZE": "1", - "WORK_GROUP_SIZE_X": "32", - "WORK_GROUP_SIZE_Y": "8", - "WORK_GROUP_SIZE_Z": "1", - "Z_ITERATIONS": "16" + "INNER_UNROLL_FACTOR": 1, + "USE_CONSTANT_MEMORY": 0, + "USE_SOA": 1, + "VECTOR_SIZE": 1, + "WORK_GROUP_SIZE_X": 32, + "WORK_GROUP_SIZE_Y": 4, + "WORK_GROUP_SIZE_Z": 1, + "Z_ITERATIONS": 4 }, "correctness": 1, "invalidity": "correct", @@ -40046,61 +43268,61 @@ { "name": "time", "unit": "", - "value": 7709.216 + "value": 6275.488 }, { "name": "dram__throughput.avg.pct_of_peak_sustained_elapsed", "type": "Double", "unit": "", - "value": 13.654210623972856 + "value": 7.739954517107159 }, { "name": "dram__sectors_read.sum", "type": "Double", "unit": "", - "value": 2114044.0 + "value": 5400.0 }, { "name": "dram__sectors_write.sum", "type": "Double", "unit": "", - "value": 1842068.0 + "value": 1837368.0 }, { "name": "lts__t_sectors.avg.pct_of_peak_sustained_elapsed", "type": "Double", "unit": "", - "value": 1.657946001379558 + "value": 1.0240053525963662 }, { "name": "lts__t_sectors_op_read.sum", "type": "Double", "unit": "", - "value": 2239354.0 + "value": 106549.0 }, { "name": "lts__t_sectors_op_write.sum", "type": "Double", "unit": "", - "value": 2110240.0 + "value": 2100552.0 }, { "name": "l1tex__t_requests_pipe_lsu_mem_global_op_ld.avg.pct_of_peak_sustained_active", "type": "Double", "unit": "", - "value": 2.9267962703386474 + "value": 13.38369666480172 }, { "name": "l1tex__t_requests_pipe_lsu_mem_global_op_ld.sum", "type": "Double", "unit": "", - "value": 8912896.0 + "value": 33554432.0 }, { "name": "l1tex__data_pipe_lsu_wavefronts_mem_shared.avg.pct_of_peak_sustained_elapsed", "type": "Double", "unit": "", - "value": 0.03227501337175236 + "value": 0.15681089417143745 }, { "name": "l1tex__data_pipe_lsu_wavefronts_mem_shared_op_ld.sum", @@ -40130,13 +43352,13 @@ "name": "sm__warps_active.avg.pct_of_peak_sustained_active", "type": "Double", "unit": "", - "value": 98.65158840561398 + "value": 98.5448901933493 }, { "name": "smsp__cycles_active.avg.pct_of_peak_sustained_elapsed", "type": "Double", "unit": "", - "value": 99.97741817740416 + "value": 99.97378997801259 }, { "name": "smsp__sass_thread_inst_executed_op_fp16_pred_on.sum", @@ -40148,7 +43370,7 @@ "name": "smsp__sass_thread_inst_executed_op_fp32_pred_on.sum", "type": "Double", "unit": "", - "value": 4647288832.0 + "value": 5637144576.0 }, { "name": "smsp__sass_thread_inst_executed_op_fp64_pred_on.sum", @@ -40160,43 +43382,43 @@ "name": "smsp__sass_thread_inst_executed_op_integer_pred_on.sum", "type": "Double", "unit": "", - "value": 8467251200.0 + "value": 10854858752.0 }, { "name": "smsp__sass_thread_inst_executed_op_control_pred_on.sum", "type": "Double", "unit": "", - "value": 536870912.0 + "value": 1073741824.0 }, { "name": "smsp__sass_thread_inst_executed_op_memory_pred_on.sum", "type": "Double", "unit": "", - "value": 301989888.0 + "value": 1090519040.0 }, { "name": "smsp__sass_thread_inst_executed_op_misc_pred_on.sum", "type": "Double", "unit": "", - "value": 1429209088.0 + "value": 1392508928.0 }, { "name": "smsp__sass_thread_inst_executed_op_conversion_pred_on.sum", "type": "Double", "unit": "", - "value": 3145728.0 + "value": 12582912.0 }, { "name": "smsp__inst_executed.sum", "type": "Double", "unit": "", - "value": 971702272.0 + "value": 837287936.0 }, { "name": "smsp__pipe_fma_cycles_active.avg.pct_of_peak_sustained_active", "type": "Double", "unit": "", - "value": 27.56647047189112 + "value": 36.23303387071551 }, { "name": "smsp__inst_executed_pipe_fp64.avg.pct_of_peak_sustained_active", @@ -40208,13 +43430,13 @@ "name": "smsp__inst_executed_pipe_xu.avg.pct_of_peak_sustained_active", "type": "Double", "unit": "", - "value": 22.03805238901035 + "value": 26.769408873210228 }, { "name": "smsp__inst_executed_pipe_lsu.avg.pct_of_peak_sustained_active", "type": "Double", "unit": "", - "value": 1.5656917102543972 + "value": 6.875346224271767 }, { "name": "smsp__inst_executed_pipe_tex.avg.pct_of_peak_sustained_active", @@ -40226,7 +43448,7 @@ "name": "smsp__issue_active.avg.pct_of_peak_sustained_active", "type": "Double", "unit": "", - "value": 79.77527892759852 + "value": 83.49794363551302 }, { "name": "smsp__thread_inst_executed_per_inst_executed.ratio", @@ -40245,30 +43467,47 @@ "time" ], "times": { - "compilation": 14964.365, - "data": 63909.252, - "framework": 3117832.2010000004, - "kernel_overhead": 1475488.094, - "profiling_overhead": 53769.574, - "profiling_runs": 1524665.281, + "compilation_time": 14429.548, + "data": 57976.449, + "framework": 2737390.726, + "kernel_overhead": 1294037.349, + "profiling_overhead": 47987.103, + "profiling_runs": 1337389.825, "runtimes": [ - 7709.216 + 6275.488 ], - "search_algorithm": 39.457, - "validation": 20.363 + "search_algorithm": 26.528, + "validation": 17.194 }, - "timestamp": "2026-01-27 09:26:56 UTC" + "timestamp": "2026-03-13 09:41:40 UTC" }, { + "compilation_data": { + "constant_memory_size": 0, + "global_size": { + "x": 8, + "y": 64, + "z": 32 + }, + "local_memory_size": 0, + "local_size": { + "x": 32, + "y": 4, + "z": 1 + }, + "max_work_group_size": 1024, + "private_memory_size": 32, + "registers": 27 + }, "configuration": { - "INNER_UNROLL_FACTOR": "2", - "USE_CONSTANT_MEMORY": "0", - "USE_SOA": "1", - "VECTOR_SIZE": "1", - "WORK_GROUP_SIZE_X": "16", - "WORK_GROUP_SIZE_Y": "4", - "WORK_GROUP_SIZE_Z": "1", - "Z_ITERATIONS": "32" + "INNER_UNROLL_FACTOR": 1, + "USE_CONSTANT_MEMORY": 0, + "USE_SOA": 1, + "VECTOR_SIZE": 1, + "WORK_GROUP_SIZE_X": 32, + "WORK_GROUP_SIZE_Y": 4, + "WORK_GROUP_SIZE_Z": 1, + "Z_ITERATIONS": 8 }, "correctness": 1, "invalidity": "correct", @@ -40276,61 +43515,61 @@ { "name": "time", "unit": "", - "value": 9789.92 + "value": 8165.056 }, { "name": "dram__throughput.avg.pct_of_peak_sustained_elapsed", "type": "Double", "unit": "", - "value": 10.935522445645505 + "value": 5.705653798780534 }, { "name": "dram__sectors_read.sum", "type": "Double", "unit": "", - "value": 2109256.0 + "value": 1180.0 }, { "name": "dram__sectors_write.sum", "type": "Double", "unit": "", - "value": 2226124.0 + "value": 1908076.0 }, { "name": "lts__t_sectors.avg.pct_of_peak_sustained_elapsed", "type": "Double", "unit": "", - "value": 80.1826891031837 + "value": 48.710247765670566 }, { "name": "lts__t_sectors_op_read.sum", "type": "Double", "unit": "", - "value": 135660159.0 + "value": 655836.0 }, { "name": "lts__t_sectors_op_write.sum", "type": "Double", "unit": "", - "value": 138424472.0 + "value": 138415628.0 }, { "name": "l1tex__t_requests_pipe_lsu_mem_global_op_ld.avg.pct_of_peak_sustained_active", "type": "Double", "unit": "", - "value": 1.220333204288555 + "value": 5.105256002591709 }, { "name": "l1tex__t_requests_pipe_lsu_mem_global_op_ld.sum", "type": "Double", "unit": "", - "value": 4718592.0 + "value": 16777216.0 }, { "name": "l1tex__data_pipe_lsu_wavefronts_mem_shared.avg.pct_of_peak_sustained_elapsed", "type": "Double", "unit": "", - "value": 0.012562160356263194 + "value": 0.059622818712423684 }, { "name": "l1tex__data_pipe_lsu_wavefronts_mem_shared_op_ld.sum", @@ -40360,13 +43599,13 @@ "name": "sm__warps_active.avg.pct_of_peak_sustained_active", "type": "Double", "unit": "", - "value": 98.949708421784 + "value": 97.87349357670632 }, { "name": "smsp__cycles_active.avg.pct_of_peak_sustained_elapsed", "type": "Double", "unit": "", - "value": 98.68000049735441 + "value": 99.68678156820337 }, { "name": "smsp__sass_thread_inst_executed_op_fp16_pred_on.sum", @@ -40378,7 +43617,7 @@ "name": "smsp__sass_thread_inst_executed_op_fp32_pred_on.sum", "type": "Double", "unit": "", - "value": 4479516672.0 + "value": 4966055936.0 }, { "name": "smsp__sass_thread_inst_executed_op_fp64_pred_on.sum", @@ -40390,43 +43629,43 @@ "name": "smsp__sass_thread_inst_executed_op_integer_pred_on.sum", "type": "Double", "unit": "", - "value": 1883766784.0 + "value": 4250927104.0 }, { "name": "smsp__sass_thread_inst_executed_op_control_pred_on.sum", "type": "Double", "unit": "", - "value": 536870912.0 + "value": 1073741824.0 }, { "name": "smsp__sass_thread_inst_executed_op_memory_pred_on.sum", "type": "Double", "unit": "", - "value": 1262485504.0 + "value": 2722103296.0 }, { "name": "smsp__sass_thread_inst_executed_op_misc_pred_on.sum", "type": "Double", "unit": "", - "value": 106954752.0 + "value": 289406976.0 }, { "name": "smsp__sass_thread_inst_executed_op_conversion_pred_on.sum", "type": "Double", "unit": "", - "value": 1572864.0 + "value": 6291456.0 }, { "name": "smsp__inst_executed.sum", "type": "Double", "unit": "", - "value": 259604480.0 + "value": 420413440.0 }, { "name": "smsp__pipe_fma_cycles_active.avg.pct_of_peak_sustained_active", "type": "Double", "unit": "", - "value": 7.538320077230029 + "value": 12.26992002256558 }, { "name": "smsp__inst_executed_pipe_fp64.avg.pct_of_peak_sustained_active", @@ -40438,13 +43677,13 @@ "name": "smsp__inst_executed_pipe_xu.avg.pct_of_peak_sustained_active", "type": "Double", "unit": "", - "value": 17.380964923631623 + "value": 20.415199621944623 }, { "name": "smsp__inst_executed_pipe_lsu.avg.pct_of_peak_sustained_active", "type": "Double", "unit": "", - "value": 5.115418265487774 + "value": 12.968835306713844 }, { "name": "smsp__inst_executed_pipe_tex.avg.pct_of_peak_sustained_active", @@ -40456,7 +43695,7 @@ "name": "smsp__issue_active.avg.pct_of_peak_sustained_active", "type": "Double", "unit": "", - "value": 16.80944427226605 + "value": 31.973875102014222 }, { "name": "smsp__thread_inst_executed_per_inst_executed.ratio", @@ -40475,30 +43714,47 @@ "time" ], "times": { - "compilation": 24373.0, - "data": 61714.977, - "framework": 560610.72, - "kernel_overhead": 191217.135, - "profiling_overhead": 51391.878, - "profiling_runs": 256286.73, + "compilation_time": 14318.138, + "data": 59219.636, + "framework": 922731.371, + "kernel_overhead": 379127.322, + "profiling_overhead": 48793.361, + "profiling_runs": 435591.052, "runtimes": [ - 9789.92 + 8165.056 ], - "search_algorithm": 39.94, - "validation": 25.13 + "search_algorithm": 23.88, + "validation": 13.079 }, - "timestamp": "2026-01-27 09:26:56 UTC" + "timestamp": "2026-03-13 09:41:40 UTC" }, { + "compilation_data": { + "constant_memory_size": 0, + "global_size": { + "x": 8, + "y": 64, + "z": 16 + }, + "local_memory_size": 0, + "local_size": { + "x": 32, + "y": 4, + "z": 1 + }, + "max_work_group_size": 1024, + "private_memory_size": 64, + "registers": 34 + }, "configuration": { - "INNER_UNROLL_FACTOR": "2", - "USE_CONSTANT_MEMORY": "0", - "USE_SOA": "1", - "VECTOR_SIZE": "1", - "WORK_GROUP_SIZE_X": "16", - "WORK_GROUP_SIZE_Y": "8", - "WORK_GROUP_SIZE_Z": "1", - "Z_ITERATIONS": "32" + "INNER_UNROLL_FACTOR": 1, + "USE_CONSTANT_MEMORY": 0, + "USE_SOA": 1, + "VECTOR_SIZE": 1, + "WORK_GROUP_SIZE_X": 32, + "WORK_GROUP_SIZE_Y": 4, + "WORK_GROUP_SIZE_Z": 1, + "Z_ITERATIONS": 16 }, "correctness": 1, "invalidity": "correct", @@ -40506,61 +43762,61 @@ { "name": "time", "unit": "", - "value": 9677.28 + "value": 8326.368 }, { "name": "dram__throughput.avg.pct_of_peak_sustained_elapsed", "type": "Double", "unit": "", - "value": 11.264475278921504 + "value": 6.1440526996001745 }, { "name": "dram__sectors_read.sum", "type": "Double", "unit": "", - "value": 2111336.0 + "value": 13272.0 }, { "name": "dram__sectors_write.sum", "type": "Double", "unit": "", - "value": 2230948.0 + "value": 2000892.0 }, { "name": "lts__t_sectors.avg.pct_of_peak_sustained_elapsed", "type": "Double", "unit": "", - "value": 80.13947559938268 + "value": 53.858081345761455 }, { "name": "lts__t_sectors_op_read.sum", "type": "Double", "unit": "", - "value": 135549736.0 + "value": 15966372.0 }, { "name": "lts__t_sectors_op_write.sum", "type": "Double", "unit": "", - "value": 138421311.0 + "value": 138424293.0 }, { "name": "l1tex__t_requests_pipe_lsu_mem_global_op_ld.avg.pct_of_peak_sustained_active", "type": "Double", "unit": "", - "value": 1.1989559719859115 + "value": 2.55987865418965 }, { "name": "l1tex__t_requests_pipe_lsu_mem_global_op_ld.sum", "type": "Double", "unit": "", - "value": 4718592.0 + "value": 8388608.0 }, { "name": "l1tex__data_pipe_lsu_wavefronts_mem_shared.avg.pct_of_peak_sustained_elapsed", "type": "Double", "unit": "", - "value": 0.012470936415700642 + "value": 0.02970663752605405 }, { "name": "l1tex__data_pipe_lsu_wavefronts_mem_shared_op_ld.sum", @@ -40590,13 +43846,13 @@ "name": "sm__warps_active.avg.pct_of_peak_sustained_active", "type": "Double", "unit": "", - "value": 98.7640648596998 + "value": 90.66351728643592 }, { "name": "smsp__cycles_active.avg.pct_of_peak_sustained_elapsed", "type": "Double", "unit": "", - "value": 98.19403507692411 + "value": 98.68571798099882 }, { "name": "smsp__sass_thread_inst_executed_op_fp16_pred_on.sum", @@ -40608,7 +43864,7 @@ "name": "smsp__sass_thread_inst_executed_op_fp32_pred_on.sum", "type": "Double", "unit": "", - "value": 4479516672.0 + "value": 4630511616.0 }, { "name": "smsp__sass_thread_inst_executed_op_fp64_pred_on.sum", @@ -40620,43 +43876,43 @@ "name": "smsp__sass_thread_inst_executed_op_integer_pred_on.sum", "type": "Double", "unit": "", - "value": 1883766784.0 + "value": 3769630720.0 }, { "name": "smsp__sass_thread_inst_executed_op_control_pred_on.sum", "type": "Double", "unit": "", - "value": 536870912.0 + "value": 1073741824.0 }, { "name": "smsp__sass_thread_inst_executed_op_memory_pred_on.sum", "type": "Double", "unit": "", - "value": 1262485504.0 + "value": 2453667840.0 }, { "name": "smsp__sass_thread_inst_executed_op_misc_pred_on.sum", "type": "Double", "unit": "", - "value": 106954752.0 + "value": 144703488.0 }, { "name": "smsp__sass_thread_inst_executed_op_conversion_pred_on.sum", "type": "Double", "unit": "", - "value": 1572864.0 + "value": 3145728.0 }, { "name": "smsp__inst_executed.sum", "type": "Double", "unit": "", - "value": 259604480.0 + "value": 379617280.0 }, { "name": "smsp__pipe_fma_cycles_active.avg.pct_of_peak_sustained_active", "type": "Double", "unit": "", - "value": 7.52060479811597 + "value": 10.08954102105952 }, { "name": "smsp__inst_executed_pipe_fp64.avg.pct_of_peak_sustained_active", @@ -40668,13 +43924,13 @@ "name": "smsp__inst_executed_pipe_xu.avg.pct_of_peak_sustained_active", "type": "Double", "unit": "", - "value": 17.340142069625575 + "value": 20.549813724505643 }, { "name": "smsp__inst_executed_pipe_lsu.avg.pct_of_peak_sustained_active", "type": "Double", "unit": "", - "value": 5.103403629134187 + "value": 11.754934950321466 }, { "name": "smsp__inst_executed_pipe_tex.avg.pct_of_peak_sustained_active", @@ -40686,7 +43942,7 @@ "name": "smsp__issue_active.avg.pct_of_peak_sustained_active", "type": "Double", "unit": "", - "value": 16.769978418321493 + "value": 29.061597716836324 }, { "name": "smsp__thread_inst_executed_per_inst_executed.ratio", @@ -40705,30 +43961,47 @@ "time" ], "times": { - "compilation": 26657.644, - "data": 62524.088, - "framework": 566596.002, - "kernel_overhead": 194359.785, - "profiling_overhead": 51958.264, - "profiling_runs": 257753.865, + "compilation_time": 14537.364, + "data": 59228.145, + "framework": 881912.73, + "kernel_overhead": 359250.378, + "profiling_overhead": 48861.977, + "profiling_runs": 414572.23, "runtimes": [ - 9677.28 + 8326.368 ], - "search_algorithm": 39.16, - "validation": 23.568 + "search_algorithm": 24.222, + "validation": 13.499 }, - "timestamp": "2026-01-27 09:26:56 UTC" + "timestamp": "2026-03-13 09:41:41 UTC" }, { + "compilation_data": { + "constant_memory_size": 0, + "global_size": { + "x": 8, + "y": 64, + "z": 8 + }, + "local_memory_size": 0, + "local_size": { + "x": 32, + "y": 4, + "z": 1 + }, + "max_work_group_size": 1024, + "private_memory_size": 128, + "registers": 32 + }, "configuration": { - "INNER_UNROLL_FACTOR": "2", - "USE_CONSTANT_MEMORY": "0", - "USE_SOA": "1", - "VECTOR_SIZE": "1", - "WORK_GROUP_SIZE_X": "32", - "WORK_GROUP_SIZE_Y": "2", - "WORK_GROUP_SIZE_Z": "1", - "Z_ITERATIONS": "32" + "INNER_UNROLL_FACTOR": 1, + "USE_CONSTANT_MEMORY": 0, + "USE_SOA": 1, + "VECTOR_SIZE": 1, + "WORK_GROUP_SIZE_X": 32, + "WORK_GROUP_SIZE_Y": 4, + "WORK_GROUP_SIZE_Z": 1, + "Z_ITERATIONS": 32 }, "correctness": 1, "invalidity": "correct", @@ -40736,61 +44009,61 @@ { "name": "time", "unit": "", - "value": 9428.192 + "value": 9748.032 }, { "name": "dram__throughput.avg.pct_of_peak_sustained_elapsed", "type": "Double", "unit": "", - "value": 10.73558874966368 + "value": 5.875093301855567 }, { "name": "dram__sectors_read.sum", "type": "Double", "unit": "", - "value": 2108032.0 + "value": 27268.0 }, { "name": "dram__sectors_write.sum", "type": "Double", "unit": "", - "value": 2182136.0 + "value": 2199304.0 }, { "name": "lts__t_sectors.avg.pct_of_peak_sustained_elapsed", "type": "Double", "unit": "", - "value": 81.6809738385578 + "value": 76.40114307391784 }, { "name": "lts__t_sectors_op_read.sum", "type": "Double", "unit": "", - "value": 135001939.0 + "value": 118383656.0 }, { "name": "lts__t_sectors_op_write.sum", "type": "Double", "unit": "", - "value": 138416137.0 + "value": 138421197.0 }, { "name": "l1tex__t_requests_pipe_lsu_mem_global_op_ld.avg.pct_of_peak_sustained_active", "type": "Double", "unit": "", - "value": 1.229001380197509 + "value": 1.1016916762538722 }, { "name": "l1tex__t_requests_pipe_lsu_mem_global_op_ld.sum", "type": "Double", "unit": "", - "value": 4718592.0 + "value": 4194304.0 }, { "name": "l1tex__data_pipe_lsu_wavefronts_mem_shared.avg.pct_of_peak_sustained_elapsed", "type": "Double", "unit": "", - "value": 0.012883497139744111 + "value": 0.012608967315315672 }, { "name": "l1tex__data_pipe_lsu_wavefronts_mem_shared_op_ld.sum", @@ -40820,13 +44093,13 @@ "name": "sm__warps_active.avg.pct_of_peak_sustained_active", "type": "Double", "unit": "", - "value": 98.82340932896282 + "value": 94.40704399367904 }, { "name": "smsp__cycles_active.avg.pct_of_peak_sustained_elapsed", "type": "Double", "unit": "", - "value": 99.84469065033247 + "value": 96.4345562299196 }, { "name": "smsp__sass_thread_inst_executed_op_fp16_pred_on.sum", @@ -40838,7 +44111,7 @@ "name": "smsp__sass_thread_inst_executed_op_fp32_pred_on.sum", "type": "Double", "unit": "", - "value": 4479516672.0 + "value": 4462739456.0 }, { "name": "smsp__sass_thread_inst_executed_op_fp64_pred_on.sum", @@ -40850,25 +44123,25 @@ "name": "smsp__sass_thread_inst_executed_op_integer_pred_on.sum", "type": "Double", "unit": "", - "value": 1883766784.0 + "value": 3528982528.0 }, { "name": "smsp__sass_thread_inst_executed_op_control_pred_on.sum", "type": "Double", "unit": "", - "value": 536870912.0 + "value": 1073741824.0 }, { "name": "smsp__sass_thread_inst_executed_op_memory_pred_on.sum", "type": "Double", "unit": "", - "value": 1262485504.0 + "value": 2319450112.0 }, { "name": "smsp__sass_thread_inst_executed_op_misc_pred_on.sum", "type": "Double", "unit": "", - "value": 106954752.0 + "value": 72351744.0 }, { "name": "smsp__sass_thread_inst_executed_op_conversion_pred_on.sum", @@ -40880,13 +44153,13 @@ "name": "smsp__inst_executed.sum", "type": "Double", "unit": "", - "value": 259604480.0 + "value": 359219200.0 }, { "name": "smsp__pipe_fma_cycles_active.avg.pct_of_peak_sustained_active", "type": "Double", "unit": "", - "value": 7.640892681097533 + "value": 7.781312277429116 }, { "name": "smsp__inst_executed_pipe_fp64.avg.pct_of_peak_sustained_active", @@ -40898,13 +44171,13 @@ "name": "smsp__inst_executed_pipe_xu.avg.pct_of_peak_sustained_active", "type": "Double", "unit": "", - "value": 17.617629921254824 + "value": 17.851944414475117 }, { "name": "smsp__inst_executed_pipe_lsu.avg.pct_of_peak_sustained_active", "type": "Double", "unit": "", - "value": 5.18507150148259 + "value": 9.647284902695477 }, { "name": "smsp__inst_executed_pipe_tex.avg.pct_of_peak_sustained_active", @@ -40916,7 +44189,7 @@ "name": "smsp__issue_active.avg.pct_of_peak_sustained_active", "type": "Double", "unit": "", - "value": 17.03832718903714 + "value": 23.88971693390926 }, { "name": "smsp__thread_inst_executed_per_inst_executed.ratio", @@ -40935,30 +44208,47 @@ "time" ], "times": { - "compilation": 29102.886, - "data": 61642.534, - "framework": 564268.074, - "kernel_overhead": 194156.797, - "profiling_overhead": 50910.952, - "profiling_runs": 257557.791, + "compilation_time": 14464.738, + "data": 59134.2, + "framework": 874380.028, + "kernel_overhead": 351312.655, + "profiling_overhead": 48976.355, + "profiling_runs": 414956.818, "runtimes": [ - 9428.192 + 9748.032 ], - "search_algorithm": 42.42, - "validation": 30.43 + "search_algorithm": 26.525, + "validation": 16.181 }, - "timestamp": "2026-01-27 09:26:57 UTC" + "timestamp": "2026-03-13 09:41:41 UTC" }, { + "compilation_data": { + "constant_memory_size": 0, + "global_size": { + "x": 8, + "y": 64, + "z": 64 + }, + "local_memory_size": 0, + "local_size": { + "x": 32, + "y": 4, + "z": 1 + }, + "max_work_group_size": 1024, + "private_memory_size": 0, + "registers": 26 + }, "configuration": { - "INNER_UNROLL_FACTOR": "2", - "USE_CONSTANT_MEMORY": "0", - "USE_SOA": "1", - "VECTOR_SIZE": "1", - "WORK_GROUP_SIZE_X": "32", - "WORK_GROUP_SIZE_Y": "4", - "WORK_GROUP_SIZE_Z": "1", - "Z_ITERATIONS": "32" + "INNER_UNROLL_FACTOR": 2, + "USE_CONSTANT_MEMORY": 0, + "USE_SOA": 1, + "VECTOR_SIZE": 1, + "WORK_GROUP_SIZE_X": 32, + "WORK_GROUP_SIZE_Y": 4, + "WORK_GROUP_SIZE_Z": 1, + "Z_ITERATIONS": 4 }, "correctness": 1, "invalidity": "correct", @@ -40966,61 +44256,61 @@ { "name": "time", "unit": "", - "value": 9443.232 + "value": 4027.232 }, { "name": "dram__throughput.avg.pct_of_peak_sustained_elapsed", "type": "Double", "unit": "", - "value": 10.734055176267333 + "value": 12.055901728768042 }, { "name": "dram__sectors_read.sum", "type": "Double", "unit": "", - "value": 2101132.0 + "value": 876.0 }, { "name": "dram__sectors_write.sum", "type": "Double", "unit": "", - "value": 2177404.0 + "value": 1837948.0 }, { "name": "lts__t_sectors.avg.pct_of_peak_sustained_elapsed", "type": "Double", "unit": "", - "value": 81.82794390184355 + "value": 1.569771493699714 }, { "name": "lts__t_sectors_op_read.sum", "type": "Double", "unit": "", - "value": 134965406.0 + "value": 67125.0 }, { "name": "lts__t_sectors_op_write.sum", "type": "Double", "unit": "", - "value": 138419478.0 + "value": 2100212.0 }, { "name": "l1tex__t_requests_pipe_lsu_mem_global_op_ld.avg.pct_of_peak_sustained_active", "type": "Double", "unit": "", - "value": 1.2454973877280564 + "value": 20.895048302223522 }, { "name": "l1tex__t_requests_pipe_lsu_mem_global_op_ld.sum", "type": "Double", "unit": "", - "value": 4718592.0 + "value": 33554432.0 }, { "name": "l1tex__data_pipe_lsu_wavefronts_mem_shared.avg.pct_of_peak_sustained_elapsed", "type": "Double", "unit": "", - "value": 0.012875657362663008 + "value": 0.24477752421219334 }, { "name": "l1tex__data_pipe_lsu_wavefronts_mem_shared_op_ld.sum", @@ -41038,25 +44328,25 @@ "name": "l1tex__t_sectors_pipe_lsu_mem_local_op_ld.sum", "type": "Double", "unit": "", - "value": 136314880.0 + "value": 0.0 }, { "name": "l1tex__t_sectors_pipe_lsu_mem_local_op_st.sum", "type": "Double", "unit": "", - "value": 136314880.0 + "value": 0.0 }, { "name": "sm__warps_active.avg.pct_of_peak_sustained_active", "type": "Double", "unit": "", - "value": 98.72367393049755 + "value": 98.37864317763598 }, { "name": "smsp__cycles_active.avg.pct_of_peak_sustained_elapsed", "type": "Double", "unit": "", - "value": 98.99953546308944 + "value": 99.95758539785933 }, { "name": "smsp__sass_thread_inst_executed_op_fp16_pred_on.sum", @@ -41068,7 +44358,7 @@ "name": "smsp__sass_thread_inst_executed_op_fp32_pred_on.sum", "type": "Double", "unit": "", - "value": 4479516672.0 + "value": 5637144576.0 }, { "name": "smsp__sass_thread_inst_executed_op_fp64_pred_on.sum", @@ -41080,7 +44370,7 @@ "name": "smsp__sass_thread_inst_executed_op_integer_pred_on.sum", "type": "Double", "unit": "", - "value": 1883766784.0 + "value": 5221908480.0 }, { "name": "smsp__sass_thread_inst_executed_op_control_pred_on.sum", @@ -41092,31 +44382,31 @@ "name": "smsp__sass_thread_inst_executed_op_memory_pred_on.sum", "type": "Double", "unit": "", - "value": 1262485504.0 + "value": 1090519040.0 }, { "name": "smsp__sass_thread_inst_executed_op_misc_pred_on.sum", "type": "Double", "unit": "", - "value": 106954752.0 + "value": 2193620992.0 }, { "name": "smsp__sass_thread_inst_executed_op_conversion_pred_on.sum", "type": "Double", "unit": "", - "value": 1572864.0 + "value": 12582912.0 }, { "name": "smsp__inst_executed.sum", "type": "Double", "unit": "", - "value": 259604480.0 + "value": 535298048.0 }, { "name": "smsp__pipe_fma_cycles_active.avg.pct_of_peak_sustained_active", "type": "Double", "unit": "", - "value": 7.701510341592612 + "value": 39.60918691761892 }, { "name": "smsp__inst_executed_pipe_fp64.avg.pct_of_peak_sustained_active", @@ -41128,13 +44418,13 @@ "name": "smsp__inst_executed_pipe_xu.avg.pct_of_peak_sustained_active", "type": "Double", "unit": "", - "value": 17.75721886329138 + "value": 41.79309050527442 }, { "name": "smsp__inst_executed_pipe_lsu.avg.pct_of_peak_sustained_active", "type": "Double", "unit": "", - "value": 5.2261541356683985 + "value": 10.733967580944505 }, { "name": "smsp__inst_executed_pipe_tex.avg.pct_of_peak_sustained_active", @@ -41146,7 +44436,7 @@ "name": "smsp__issue_active.avg.pct_of_peak_sustained_active", "type": "Double", "unit": "", - "value": 17.17334125124835 + "value": 83.34199774108289 }, { "name": "smsp__thread_inst_executed_per_inst_executed.ratio", @@ -41165,30 +44455,47 @@ "time" ], "times": { - "compilation": 29248.542, - "data": 60960.411, - "framework": 558851.112, - "kernel_overhead": 192134.326, - "profiling_overhead": 50750.839, - "profiling_runs": 255005.536, + "compilation_time": 14474.966, + "data": 57305.104, + "framework": 1371169.661, + "kernel_overhead": 617559.667, + "profiling_overhead": 47135.82, + "profiling_runs": 649169.07, "runtimes": [ - 9443.232 + 4027.232 ], - "search_algorithm": 38.015, - "validation": 25.489 + "search_algorithm": 25.636, + "validation": 15.422 }, - "timestamp": "2026-01-27 09:26:57 UTC" + "timestamp": "2026-03-13 09:41:42 UTC" }, { + "compilation_data": { + "constant_memory_size": 0, + "global_size": { + "x": 8, + "y": 64, + "z": 32 + }, + "local_memory_size": 0, + "local_size": { + "x": 32, + "y": 4, + "z": 1 + }, + "max_work_group_size": 1024, + "private_memory_size": 0, + "registers": 30 + }, "configuration": { - "INNER_UNROLL_FACTOR": "2", - "USE_CONSTANT_MEMORY": "0", - "USE_SOA": "1", - "VECTOR_SIZE": "1", - "WORK_GROUP_SIZE_X": "32", - "WORK_GROUP_SIZE_Y": "8", - "WORK_GROUP_SIZE_Z": "1", - "Z_ITERATIONS": "32" + "INNER_UNROLL_FACTOR": 2, + "USE_CONSTANT_MEMORY": 0, + "USE_SOA": 1, + "VECTOR_SIZE": 1, + "WORK_GROUP_SIZE_X": 32, + "WORK_GROUP_SIZE_Y": 4, + "WORK_GROUP_SIZE_Z": 1, + "Z_ITERATIONS": 8 }, "correctness": 1, "invalidity": "correct", @@ -41196,61 +44503,61 @@ { "name": "time", "unit": "", - "value": 9193.024 + "value": 4440.928 }, { "name": "dram__throughput.avg.pct_of_peak_sustained_elapsed", "type": "Double", "unit": "", - "value": 11.908575959719515 + "value": 10.686826090508086 }, { "name": "dram__sectors_read.sum", "type": "Double", "unit": "", - "value": 2117384.0 + "value": 4596.0 }, { "name": "dram__sectors_write.sum", "type": "Double", "unit": "", - "value": 2169368.0 + "value": 1837464.0 }, { "name": "lts__t_sectors.avg.pct_of_peak_sustained_elapsed", "type": "Double", "unit": "", - "value": 78.05108536324727 + "value": 1.464399188039428 }, { "name": "lts__t_sectors_op_read.sum", "type": "Double", "unit": "", - "value": 114171912.0 + "value": 75962.0 }, { "name": "lts__t_sectors_op_write.sum", "type": "Double", "unit": "", - "value": 138425752.0 + "value": 2100235.0 }, { "name": "l1tex__t_requests_pipe_lsu_mem_global_op_ld.avg.pct_of_peak_sustained_active", "type": "Double", "unit": "", - "value": 1.2844883431845513 + "value": 9.818107575907034 }, { "name": "l1tex__t_requests_pipe_lsu_mem_global_op_ld.sum", "type": "Double", "unit": "", - "value": 4718592.0 + "value": 16777216.0 }, { "name": "l1tex__data_pipe_lsu_wavefronts_mem_shared.avg.pct_of_peak_sustained_elapsed", "type": "Double", "unit": "", - "value": 0.013123128608539977 + "value": 0.11502705143627252 }, { "name": "l1tex__data_pipe_lsu_wavefronts_mem_shared_op_ld.sum", @@ -41268,25 +44575,25 @@ "name": "l1tex__t_sectors_pipe_lsu_mem_local_op_ld.sum", "type": "Double", "unit": "", - "value": 136314880.0 + "value": 0.0 }, { "name": "l1tex__t_sectors_pipe_lsu_mem_local_op_st.sum", "type": "Double", "unit": "", - "value": 136314880.0 + "value": 0.0 }, { "name": "sm__warps_active.avg.pct_of_peak_sustained_active", "type": "Double", "unit": "", - "value": 92.51680723017081 + "value": 98.76863159726551 }, { "name": "smsp__cycles_active.avg.pct_of_peak_sustained_elapsed", "type": "Double", "unit": "", - "value": 97.95755151415358 + "value": 99.96821058983706 }, { "name": "smsp__sass_thread_inst_executed_op_fp16_pred_on.sum", @@ -41298,7 +44605,7 @@ "name": "smsp__sass_thread_inst_executed_op_fp32_pred_on.sum", "type": "Double", "unit": "", - "value": 4479516672.0 + "value": 4966055936.0 }, { "name": "smsp__sass_thread_inst_executed_op_fp64_pred_on.sum", @@ -41310,7 +44617,7 @@ "name": "smsp__sass_thread_inst_executed_op_integer_pred_on.sum", "type": "Double", "unit": "", - "value": 1883766784.0 + "value": 5863636992.0 }, { "name": "smsp__sass_thread_inst_executed_op_control_pred_on.sum", @@ -41322,31 +44629,31 @@ "name": "smsp__sass_thread_inst_executed_op_memory_pred_on.sum", "type": "Double", "unit": "", - "value": 1262485504.0 + "value": 553648128.0 }, { "name": "smsp__sass_thread_inst_executed_op_misc_pred_on.sum", "type": "Double", "unit": "", - "value": 106954752.0 + "value": 1373634560.0 }, { "name": "smsp__sass_thread_inst_executed_op_conversion_pred_on.sum", "type": "Double", "unit": "", - "value": 1572864.0 + "value": 6291456.0 }, { "name": "smsp__inst_executed.sum", "type": "Double", "unit": "", - "value": 259604480.0 + "value": 621477888.0 }, { "name": "smsp__pipe_fma_cycles_active.avg.pct_of_peak_sustained_active", "type": "Double", "unit": "", - "value": 7.933366896392122 + "value": 43.263768865325616 }, { "name": "smsp__inst_executed_pipe_fp64.avg.pct_of_peak_sustained_active", @@ -41358,13 +44665,13 @@ "name": "smsp__inst_executed_pipe_xu.avg.pct_of_peak_sustained_active", "type": "Double", "unit": "", - "value": 18.291029787806693 + "value": 39.27505219768252 }, { "name": "smsp__inst_executed_pipe_lsu.avg.pct_of_peak_sustained_active", "type": "Double", "unit": "", - "value": 5.383260842090079 + "value": 5.120331512100211 }, { "name": "smsp__inst_executed_pipe_tex.avg.pct_of_peak_sustained_active", @@ -41376,7 +44683,7 @@ "name": "smsp__issue_active.avg.pct_of_peak_sustained_active", "type": "Double", "unit": "", - "value": 17.689601771545796 + "value": 90.92958069648898 }, { "name": "smsp__thread_inst_executed_per_inst_executed.ratio", @@ -41395,30 +44702,47 @@ "time" ], "times": { - "compilation": 28271.118, - "data": 62138.935, - "framework": 558314.538, - "kernel_overhead": 191179.425, - "profiling_overhead": 51750.962, - "profiling_runs": 253245.216, + "compilation_time": 14019.708, + "data": 57868.448, + "framework": 1880982.1069999998, + "kernel_overhead": 870756.357, + "profiling_overhead": 47808.682, + "profiling_runs": 904548.62, "runtimes": [ - 9193.024 + 4440.928 ], - "search_algorithm": 37.601, - "validation": 25.794 + "search_algorithm": 27.175, + "validation": 14.517 }, - "timestamp": "2026-01-27 09:26:57 UTC" + "timestamp": "2026-03-13 09:41:43 UTC" }, { + "compilation_data": { + "constant_memory_size": 0, + "global_size": { + "x": 8, + "y": 64, + "z": 16 + }, + "local_memory_size": 0, + "local_size": { + "x": 32, + "y": 4, + "z": 1 + }, + "max_work_group_size": 1024, + "private_memory_size": 0, + "registers": 38 + }, "configuration": { - "INNER_UNROLL_FACTOR": "4", - "USE_CONSTANT_MEMORY": "0", - "USE_SOA": "1", - "VECTOR_SIZE": "1", - "WORK_GROUP_SIZE_X": "16", - "WORK_GROUP_SIZE_Y": "4", - "WORK_GROUP_SIZE_Z": "1", - "Z_ITERATIONS": "8" + "INNER_UNROLL_FACTOR": 2, + "USE_CONSTANT_MEMORY": 0, + "USE_SOA": 1, + "VECTOR_SIZE": 1, + "WORK_GROUP_SIZE_X": 32, + "WORK_GROUP_SIZE_Y": 4, + "WORK_GROUP_SIZE_Z": 1, + "Z_ITERATIONS": 16 }, "correctness": 1, "invalidity": "correct", @@ -41426,61 +44750,61 @@ { "name": "time", "unit": "", - "value": 2966.624 + "value": 6943.744 }, { "name": "dram__throughput.avg.pct_of_peak_sustained_elapsed", "type": "Double", "unit": "", - "value": 35.66004672897196 + "value": 7.000180964040381 }, { "name": "dram__sectors_read.sum", "type": "Double", "unit": "", - "value": 2104880.0 + "value": 6480.0 }, { "name": "dram__sectors_write.sum", "type": "Double", "unit": "", - "value": 1870696.0 + "value": 1837912.0 }, { "name": "lts__t_sectors.avg.pct_of_peak_sustained_elapsed", "type": "Double", "unit": "", - "value": 4.177706327812932 + "value": 0.9307740076988268 }, { "name": "lts__t_sectors_op_read.sum", "type": "Double", "unit": "", - "value": 2140858.0 + "value": 117824.0 }, { "name": "lts__t_sectors_op_write.sum", "type": "Double", "unit": "", - "value": 2104219.0 + "value": 2101122.0 }, { "name": "l1tex__t_requests_pipe_lsu_mem_global_op_ld.avg.pct_of_peak_sustained_active", "type": "Double", "unit": "", - "value": 14.740371648802647 + "value": 3.041058234763193 }, { "name": "l1tex__t_requests_pipe_lsu_mem_global_op_ld.sum", "type": "Double", "unit": "", - "value": 17301504.0 + "value": 8388608.0 }, { "name": "l1tex__data_pipe_lsu_wavefronts_mem_shared.avg.pct_of_peak_sustained_elapsed", "type": "Double", "unit": "", - "value": 0.16744021009603968 + "value": 0.03563319075127311 }, { "name": "l1tex__data_pipe_lsu_wavefronts_mem_shared_op_ld.sum", @@ -41510,13 +44834,13 @@ "name": "sm__warps_active.avg.pct_of_peak_sustained_active", "type": "Double", "unit": "", - "value": 98.70489716966954 + "value": 98.82274779127494 }, { "name": "smsp__cycles_active.avg.pct_of_peak_sustained_elapsed", "type": "Double", "unit": "", - "value": 99.97244098236769 + "value": 99.98330455390612 }, { "name": "smsp__sass_thread_inst_executed_op_fp16_pred_on.sum", @@ -41528,7 +44852,7 @@ "name": "smsp__sass_thread_inst_executed_op_fp32_pred_on.sum", "type": "Double", "unit": "", - "value": 4982833152.0 + "value": 4630511616.0 }, { "name": "smsp__sass_thread_inst_executed_op_fp64_pred_on.sum", @@ -41540,43 +44864,43 @@ "name": "smsp__sass_thread_inst_executed_op_integer_pred_on.sum", "type": "Double", "unit": "", - "value": 2772434944.0 + "value": 8738832384.0 }, { "name": "smsp__sass_thread_inst_executed_op_control_pred_on.sum", "type": "Double", "unit": "", - "value": 268435456.0 + "value": 536870912.0 }, { "name": "smsp__sass_thread_inst_executed_op_memory_pred_on.sum", "type": "Double", "unit": "", - "value": 570425344.0 + "value": 285212672.0 }, { "name": "smsp__sass_thread_inst_executed_op_misc_pred_on.sum", "type": "Double", "unit": "", - "value": 1912602624.0 + "value": 1157627904.0 }, { "name": "smsp__sass_thread_inst_executed_op_conversion_pred_on.sum", "type": "Double", "unit": "", - "value": 6291456.0 + "value": 3145728.0 }, { "name": "smsp__inst_executed.sum", "type": "Double", "unit": "", - "value": 400162816.0 + "value": 970653696.0 }, { "name": "smsp__pipe_fma_cycles_active.avg.pct_of_peak_sustained_active", "type": "Double", "unit": "", - "value": 40.699318539032156 + "value": 36.47670100773274 }, { "name": "smsp__inst_executed_pipe_fp64.avg.pct_of_peak_sustained_active", @@ -41588,13 +44912,13 @@ "name": "smsp__inst_executed_pipe_xu.avg.pct_of_peak_sustained_active", "type": "Double", "unit": "", - "value": 57.16868017276385 + "value": 24.32965349705354 }, { "name": "smsp__inst_executed_pipe_lsu.avg.pct_of_peak_sustained_active", "type": "Double", "unit": "", - "value": 7.67645851929202 + "value": 1.6334606229711237 }, { "name": "smsp__inst_executed_pipe_tex.avg.pct_of_peak_sustained_active", @@ -41606,7 +44930,7 @@ "name": "smsp__issue_active.avg.pct_of_peak_sustained_active", "type": "Double", "unit": "", - "value": 85.22343496315877 + "value": 87.9755650880638 }, { "name": "smsp__thread_inst_executed_per_inst_executed.ratio", @@ -41625,30 +44949,47 @@ "time" ], "times": { - "compilation": 26849.438, - "data": 60048.03, - "framework": 1195260.8769999999, - "kernel_overhead": 529278.249, - "profiling_overhead": 49879.533, - "profiling_runs": 556055.065, + "compilation_time": 15138.729, + "data": 59506.132, + "framework": 3110194.699, + "kernel_overhead": 1478032.542, + "profiling_overhead": 48921.035, + "profiling_runs": 1523734.99, "runtimes": [ - 2966.624 + 6943.744 ], - "search_algorithm": 38.907, - "validation": 19.725 + "search_algorithm": 24.905, + "validation": 15.98 }, - "timestamp": "2026-01-27 09:26:58 UTC" + "timestamp": "2026-03-13 09:41:44 UTC" }, { + "compilation_data": { + "constant_memory_size": 0, + "global_size": { + "x": 8, + "y": 64, + "z": 8 + }, + "local_memory_size": 0, + "local_size": { + "x": 32, + "y": 4, + "z": 1 + }, + "max_work_group_size": 1024, + "private_memory_size": 128, + "registers": 32 + }, "configuration": { - "INNER_UNROLL_FACTOR": "4", - "USE_CONSTANT_MEMORY": "0", - "USE_SOA": "1", - "VECTOR_SIZE": "1", - "WORK_GROUP_SIZE_X": "16", - "WORK_GROUP_SIZE_Y": "8", - "WORK_GROUP_SIZE_Z": "1", - "Z_ITERATIONS": "8" + "INNER_UNROLL_FACTOR": 2, + "USE_CONSTANT_MEMORY": 0, + "USE_SOA": 1, + "VECTOR_SIZE": 1, + "WORK_GROUP_SIZE_X": 32, + "WORK_GROUP_SIZE_Y": 4, + "WORK_GROUP_SIZE_Z": 1, + "Z_ITERATIONS": 32 }, "correctness": 1, "invalidity": "correct", @@ -41656,61 +44997,61 @@ { "name": "time", "unit": "", - "value": 2886.016 + "value": 9602.752 }, { "name": "dram__throughput.avg.pct_of_peak_sustained_elapsed", "type": "Double", "unit": "", - "value": 35.805177709521715 + "value": 6.071388025290809 }, { "name": "dram__sectors_read.sum", "type": "Double", "unit": "", - "value": 2098012.0 + "value": 21956.0 }, { "name": "dram__sectors_write.sum", "type": "Double", "unit": "", - "value": 1871012.0 + "value": 2202756.0 }, { "name": "lts__t_sectors.avg.pct_of_peak_sustained_elapsed", "type": "Double", "unit": "", - "value": 4.19440826715859 + "value": 81.81003479648997 }, { "name": "lts__t_sectors_op_read.sum", "type": "Double", "unit": "", - "value": 2134975.0 + "value": 132987295.0 }, { "name": "lts__t_sectors_op_write.sum", "type": "Double", "unit": "", - "value": 2102794.0 + "value": 138423217.0 }, { "name": "l1tex__t_requests_pipe_lsu_mem_global_op_ld.avg.pct_of_peak_sustained_active", "type": "Double", "unit": "", - "value": 14.739880005315436 + "value": 1.105124030747497 }, { "name": "l1tex__t_requests_pipe_lsu_mem_global_op_ld.sum", "type": "Double", "unit": "", - "value": 17301504.0 + "value": 4194304.0 }, { "name": "l1tex__data_pipe_lsu_wavefronts_mem_shared.avg.pct_of_peak_sustained_elapsed", "type": "Double", "unit": "", - "value": 0.16742143468506704 + "value": 0.01273611696398749 }, { "name": "l1tex__data_pipe_lsu_wavefronts_mem_shared_op_ld.sum", @@ -41728,25 +45069,25 @@ "name": "l1tex__t_sectors_pipe_lsu_mem_local_op_ld.sum", "type": "Double", "unit": "", - "value": 0.0 + "value": 136314880.0 }, { "name": "l1tex__t_sectors_pipe_lsu_mem_local_op_st.sum", "type": "Double", "unit": "", - "value": 0.0 + "value": 136314880.0 }, { "name": "sm__warps_active.avg.pct_of_peak_sustained_active", "type": "Double", "unit": "", - "value": 98.40915630274817 + "value": 98.73289631300591 }, { "name": "smsp__cycles_active.avg.pct_of_peak_sustained_elapsed", "type": "Double", "unit": "", - "value": 99.94720414685509 + "value": 97.3080410621822 }, { "name": "smsp__sass_thread_inst_executed_op_fp16_pred_on.sum", @@ -41758,7 +45099,7 @@ "name": "smsp__sass_thread_inst_executed_op_fp32_pred_on.sum", "type": "Double", "unit": "", - "value": 4982833152.0 + "value": 4462739456.0 }, { "name": "smsp__sass_thread_inst_executed_op_fp64_pred_on.sum", @@ -41770,43 +45111,43 @@ "name": "smsp__sass_thread_inst_executed_op_integer_pred_on.sum", "type": "Double", "unit": "", - "value": 2772434944.0 + "value": 1918369792.0 }, { "name": "smsp__sass_thread_inst_executed_op_control_pred_on.sum", "type": "Double", "unit": "", - "value": 268435456.0 + "value": 536870912.0 }, { "name": "smsp__sass_thread_inst_executed_op_memory_pred_on.sum", "type": "Double", "unit": "", - "value": 570425344.0 + "value": 1245708288.0 }, { "name": "smsp__sass_thread_inst_executed_op_misc_pred_on.sum", "type": "Double", "unit": "", - "value": 1912602624.0 + "value": 72351744.0 }, { "name": "smsp__sass_thread_inst_executed_op_conversion_pred_on.sum", "type": "Double", "unit": "", - "value": 6291456.0 + "value": 1572864.0 }, { "name": "smsp__inst_executed.sum", "type": "Double", "unit": "", - "value": 400162816.0 + "value": 258555904.0 }, { "name": "smsp__pipe_fma_cycles_active.avg.pct_of_peak_sustained_active", "type": "Double", "unit": "", - "value": 40.70470998339882 + "value": 7.787644490364035 }, { "name": "smsp__inst_executed_pipe_fp64.avg.pct_of_peak_sustained_active", @@ -41818,13 +45159,13 @@ "name": "smsp__inst_executed_pipe_xu.avg.pct_of_peak_sustained_active", "type": "Double", "unit": "", - "value": 57.176703300810686 + "value": 17.870100803953324 }, { "name": "smsp__inst_executed_pipe_lsu.avg.pct_of_peak_sustained_active", "type": "Double", "unit": "", - "value": 7.6775358436147165 + "value": 5.189571510327753 }, { "name": "smsp__inst_executed_pipe_tex.avg.pct_of_peak_sustained_active", @@ -41836,7 +45177,7 @@ "name": "smsp__issue_active.avg.pct_of_peak_sustained_active", "type": "Double", "unit": "", - "value": 85.23543345949689 + "value": 17.212721335644595 }, { "name": "smsp__thread_inst_executed_per_inst_executed.ratio", @@ -41855,30 +45196,47 @@ "time" ], "times": { - "compilation": 28868.826, - "data": 60256.597, - "framework": 1190889.8849999998, - "kernel_overhead": 527215.026, - "profiling_overhead": 49713.647, - "profiling_runs": 553704.615, + "compilation_time": 14593.176, + "data": 59167.133, + "framework": 549645.456, + "kernel_overhead": 189479.949, + "profiling_overhead": 49145.34, + "profiling_runs": 251853.034, "runtimes": [ - 2886.016 + 9602.752 ], - "search_algorithm": 41.832, - "validation": 20.333 + "search_algorithm": 25.729, + "validation": 18.683 }, - "timestamp": "2026-01-27 09:26:59 UTC" + "timestamp": "2026-03-13 09:41:45 UTC" }, { + "compilation_data": { + "constant_memory_size": 0, + "global_size": { + "x": 8, + "y": 64, + "z": 32 + }, + "local_memory_size": 0, + "local_size": { + "x": 32, + "y": 4, + "z": 1 + }, + "max_work_group_size": 1024, + "private_memory_size": 0, + "registers": 31 + }, "configuration": { - "INNER_UNROLL_FACTOR": "4", - "USE_CONSTANT_MEMORY": "0", - "USE_SOA": "1", - "VECTOR_SIZE": "1", - "WORK_GROUP_SIZE_X": "32", - "WORK_GROUP_SIZE_Y": "2", - "WORK_GROUP_SIZE_Z": "1", - "Z_ITERATIONS": "8" + "INNER_UNROLL_FACTOR": 4, + "USE_CONSTANT_MEMORY": 0, + "USE_SOA": 1, + "VECTOR_SIZE": 1, + "WORK_GROUP_SIZE_X": 32, + "WORK_GROUP_SIZE_Y": 4, + "WORK_GROUP_SIZE_Z": 1, + "Z_ITERATIONS": 8 }, "correctness": 1, "invalidity": "correct", @@ -41886,61 +45244,61 @@ { "name": "time", "unit": "", - "value": 2898.88 + "value": 3219.712 }, { "name": "dram__throughput.avg.pct_of_peak_sustained_elapsed", "type": "Double", "unit": "", - "value": 35.260755009378286 + "value": 16.924268051028303 }, { "name": "dram__sectors_read.sum", "type": "Double", "unit": "", - "value": 2106328.0 + "value": 6504.0 }, { "name": "dram__sectors_write.sum", "type": "Double", "unit": "", - "value": 1839988.0 + "value": 1841052.0 }, { "name": "lts__t_sectors.avg.pct_of_peak_sustained_elapsed", "type": "Double", "unit": "", - "value": 4.2016259725254805 + "value": 2.187158297160336 }, { "name": "lts__t_sectors_op_read.sum", "type": "Double", "unit": "", - "value": 2156387.0 + "value": 54562.0 }, { "name": "lts__t_sectors_op_write.sum", "type": "Double", "unit": "", - "value": 2104599.0 + "value": 2105839.0 }, { "name": "l1tex__t_requests_pipe_lsu_mem_global_op_ld.avg.pct_of_peak_sustained_active", "type": "Double", "unit": "", - "value": 14.742907744006292 + "value": 14.858587646059554 }, { "name": "l1tex__t_requests_pipe_lsu_mem_global_op_ld.sum", "type": "Double", "unit": "", - "value": 17301504.0 + "value": 16777216.0 }, { "name": "l1tex__data_pipe_lsu_wavefronts_mem_shared.avg.pct_of_peak_sustained_elapsed", "type": "Double", "unit": "", - "value": 0.16747046949653782 + "value": 0.17405341986711853 }, { "name": "l1tex__data_pipe_lsu_wavefronts_mem_shared_op_ld.sum", @@ -41970,13 +45328,13 @@ "name": "sm__warps_active.avg.pct_of_peak_sustained_active", "type": "Double", "unit": "", - "value": 98.71818221306063 + "value": 98.66815644297495 }, { "name": "smsp__cycles_active.avg.pct_of_peak_sustained_elapsed", "type": "Double", "unit": "", - "value": 99.94875301247184 + "value": 99.9526478432454 }, { "name": "smsp__sass_thread_inst_executed_op_fp16_pred_on.sum", @@ -41988,7 +45346,7 @@ "name": "smsp__sass_thread_inst_executed_op_fp32_pred_on.sum", "type": "Double", "unit": "", - "value": 4982833152.0 + "value": 4966055936.0 }, { "name": "smsp__sass_thread_inst_executed_op_fp64_pred_on.sum", @@ -42000,7 +45358,7 @@ "name": "smsp__sass_thread_inst_executed_op_integer_pred_on.sum", "type": "Double", "unit": "", - "value": 2772434944.0 + "value": 3042967552.0 }, { "name": "smsp__sass_thread_inst_executed_op_control_pred_on.sum", @@ -42012,13 +45370,13 @@ "name": "smsp__sass_thread_inst_executed_op_memory_pred_on.sum", "type": "Double", "unit": "", - "value": 570425344.0 + "value": 553648128.0 }, { "name": "smsp__sass_thread_inst_executed_op_misc_pred_on.sum", "type": "Double", "unit": "", - "value": 1912602624.0 + "value": 1644167168.0 }, { "name": "smsp__sass_thread_inst_executed_op_conversion_pred_on.sum", @@ -42030,13 +45388,13 @@ "name": "smsp__inst_executed.sum", "type": "Double", "unit": "", - "value": 400162816.0 + "value": 399179776.0 }, { "name": "smsp__pipe_fma_cycles_active.avg.pct_of_peak_sustained_active", "type": "Double", "unit": "", - "value": 40.71647093311248 + "value": 45.942489498294584 }, { "name": "smsp__inst_executed_pipe_fp64.avg.pct_of_peak_sustained_active", @@ -42048,13 +45406,13 @@ "name": "smsp__inst_executed_pipe_xu.avg.pct_of_peak_sustained_active", "type": "Double", "unit": "", - "value": 57.1925630538068 + "value": 59.43837933586531 }, { "name": "smsp__inst_executed_pipe_lsu.avg.pct_of_peak_sustained_active", "type": "Double", "unit": "", - "value": 7.679665449119565 + "value": 7.749046524744159 }, { "name": "smsp__inst_executed_pipe_tex.avg.pct_of_peak_sustained_active", @@ -42066,7 +45424,7 @@ "name": "smsp__issue_active.avg.pct_of_peak_sustained_active", "type": "Double", "unit": "", - "value": 85.25903697901195 + "value": 88.38928239680611 }, { "name": "smsp__thread_inst_executed_per_inst_executed.ratio", @@ -42085,30 +45443,47 @@ "time" ], "times": { - "compilation": 20729.216, - "data": 60549.092, - "framework": 1193017.8939999999, - "kernel_overhead": 527932.318, - "profiling_overhead": 50252.532, - "profiling_runs": 554283.952, + "compilation_time": 14330.999, + "data": 57768.981, + "framework": 1147520.1160000002, + "kernel_overhead": 507998.857, + "profiling_overhead": 47398.216, + "profiling_runs": 534354.062, "runtimes": [ - 2898.88 + 3219.712 ], - "search_algorithm": 28.737, - "validation": 15.859 + "search_algorithm": 29.875, + "validation": 15.517 }, - "timestamp": "2026-01-27 09:26:59 UTC" + "timestamp": "2026-03-13 09:41:45 UTC" }, { + "compilation_data": { + "constant_memory_size": 0, + "global_size": { + "x": 8, + "y": 64, + "z": 16 + }, + "local_memory_size": 0, + "local_size": { + "x": 32, + "y": 4, + "z": 1 + }, + "max_work_group_size": 1024, + "private_memory_size": 0, + "registers": 38 + }, "configuration": { - "INNER_UNROLL_FACTOR": "4", - "USE_CONSTANT_MEMORY": "0", - "USE_SOA": "1", - "VECTOR_SIZE": "1", - "WORK_GROUP_SIZE_X": "32", - "WORK_GROUP_SIZE_Y": "4", - "WORK_GROUP_SIZE_Z": "1", - "Z_ITERATIONS": "8" + "INNER_UNROLL_FACTOR": 4, + "USE_CONSTANT_MEMORY": 0, + "USE_SOA": 1, + "VECTOR_SIZE": 1, + "WORK_GROUP_SIZE_X": 32, + "WORK_GROUP_SIZE_Y": 4, + "WORK_GROUP_SIZE_Z": 1, + "Z_ITERATIONS": 16 }, "correctness": 1, "invalidity": "correct", @@ -42116,61 +45491,61 @@ { "name": "time", "unit": "", - "value": 2950.72 + "value": 3507.456 }, { "name": "dram__throughput.avg.pct_of_peak_sustained_elapsed", "type": "Double", "unit": "", - "value": 34.539990131431395 + "value": 13.838928399228545 }, { "name": "dram__sectors_read.sum", "type": "Double", "unit": "", - "value": 2103204.0 + "value": 212.0 }, { "name": "dram__sectors_write.sum", "type": "Double", "unit": "", - "value": 1839196.0 + "value": 1836712.0 }, { "name": "lts__t_sectors.avg.pct_of_peak_sustained_elapsed", "type": "Double", "unit": "", - "value": 4.203632938864868 + "value": 1.7953836291584802 }, { "name": "lts__t_sectors_op_read.sum", "type": "Double", "unit": "", - "value": 2154156.0 + "value": 57069.0 }, { "name": "lts__t_sectors_op_write.sum", "type": "Double", "unit": "", - "value": 2101096.0 + "value": 2099194.0 }, { "name": "l1tex__t_requests_pipe_lsu_mem_global_op_ld.avg.pct_of_peak_sustained_active", "type": "Double", "unit": "", - "value": 14.7436151826913 + "value": 6.037402553359811 }, { "name": "l1tex__t_requests_pipe_lsu_mem_global_op_ld.sum", "type": "Double", "unit": "", - "value": 17301504.0 + "value": 8388608.0 }, { "name": "l1tex__data_pipe_lsu_wavefronts_mem_shared.avg.pct_of_peak_sustained_elapsed", "type": "Double", "unit": "", - "value": 0.16746220754929117 + "value": 0.07073180769904905 }, { "name": "l1tex__data_pipe_lsu_wavefronts_mem_shared_op_ld.sum", @@ -42200,13 +45575,13 @@ "name": "sm__warps_active.avg.pct_of_peak_sustained_active", "type": "Double", "unit": "", - "value": 98.54821621641521 + "value": 98.63063677973165 }, { "name": "smsp__cycles_active.avg.pct_of_peak_sustained_elapsed", "type": "Double", "unit": "", - "value": 99.9473403466991 + "value": 99.96802627863947 }, { "name": "smsp__sass_thread_inst_executed_op_fp16_pred_on.sum", @@ -42218,7 +45593,7 @@ "name": "smsp__sass_thread_inst_executed_op_fp32_pred_on.sum", "type": "Double", "unit": "", - "value": 4982833152.0 + "value": 4630511616.0 }, { "name": "smsp__sass_thread_inst_executed_op_fp64_pred_on.sum", @@ -42230,7 +45605,7 @@ "name": "smsp__sass_thread_inst_executed_op_integer_pred_on.sum", "type": "Double", "unit": "", - "value": 2772434944.0 + "value": 3301965824.0 }, { "name": "smsp__sass_thread_inst_executed_op_control_pred_on.sum", @@ -42242,31 +45617,31 @@ "name": "smsp__sass_thread_inst_executed_op_memory_pred_on.sum", "type": "Double", "unit": "", - "value": 570425344.0 + "value": 285212672.0 }, { "name": "smsp__sass_thread_inst_executed_op_misc_pred_on.sum", "type": "Double", "unit": "", - "value": 1912602624.0 + "value": 1429209088.0 }, { "name": "smsp__sass_thread_inst_executed_op_conversion_pred_on.sum", "type": "Double", "unit": "", - "value": 6291456.0 + "value": 3145728.0 }, { "name": "smsp__inst_executed.sum", "type": "Double", "unit": "", - "value": 400162816.0 + "value": 513540096.0 }, { "name": "smsp__pipe_fma_cycles_active.avg.pct_of_peak_sustained_active", "type": "Double", "unit": "", - "value": 40.71483977423616 + "value": 42.22264037772692 }, { "name": "smsp__inst_executed_pipe_fp64.avg.pct_of_peak_sustained_active", @@ -42278,13 +45653,13 @@ "name": "smsp__inst_executed_pipe_xu.avg.pct_of_peak_sustained_active", "type": "Double", "unit": "", - "value": 57.190549855432806 + "value": 48.30169123738616 }, { "name": "smsp__inst_executed_pipe_lsu.avg.pct_of_peak_sustained_active", "type": "Double", "unit": "", - "value": 7.6793951221894625 + "value": 3.2429113989944316 }, { "name": "smsp__inst_executed_pipe_tex.avg.pct_of_peak_sustained_active", @@ -42296,7 +45671,7 @@ "name": "smsp__issue_active.avg.pct_of_peak_sustained_active", "type": "Double", "unit": "", - "value": 85.25607503232293 + "value": 92.40604812044778 }, { "name": "smsp__thread_inst_executed_per_inst_executed.ratio", @@ -42315,30 +45690,47 @@ "time" ], "times": { - "compilation": 20025.926, - "data": 61011.849, - "framework": 1190394.786, - "kernel_overhead": 526293.503, - "profiling_overhead": 50223.043, - "profiling_runs": 552866.391, + "compilation_time": 14736.112, + "data": 57239.539, + "framework": 1830184.501, + "kernel_overhead": 848352.441, + "profiling_overhead": 47135.662, + "profiling_runs": 877456.859, "runtimes": [ - 2950.72 + 3507.456 ], - "search_algorithm": 27.173, - "validation": 13.932 + "search_algorithm": 26.572, + "validation": 16.527 }, - "timestamp": "2026-01-27 09:27:0 UTC" + "timestamp": "2026-03-13 09:41:46 UTC" }, { + "compilation_data": { + "constant_memory_size": 0, + "global_size": { + "x": 8, + "y": 64, + "z": 8 + }, + "local_memory_size": 0, + "local_size": { + "x": 32, + "y": 4, + "z": 1 + }, + "max_work_group_size": 1024, + "private_memory_size": 0, + "registers": 56 + }, "configuration": { - "INNER_UNROLL_FACTOR": "4", - "USE_CONSTANT_MEMORY": "0", - "USE_SOA": "1", - "VECTOR_SIZE": "1", - "WORK_GROUP_SIZE_X": "32", - "WORK_GROUP_SIZE_Y": "8", - "WORK_GROUP_SIZE_Z": "1", - "Z_ITERATIONS": "8" + "INNER_UNROLL_FACTOR": 4, + "USE_CONSTANT_MEMORY": 0, + "USE_SOA": 1, + "VECTOR_SIZE": 1, + "WORK_GROUP_SIZE_X": 32, + "WORK_GROUP_SIZE_Y": 4, + "WORK_GROUP_SIZE_Z": 1, + "Z_ITERATIONS": 32 }, "correctness": 1, "invalidity": "correct", @@ -42346,61 +45738,61 @@ { "name": "time", "unit": "", - "value": 2946.08 + "value": 6449.856 }, { "name": "dram__throughput.avg.pct_of_peak_sustained_elapsed", "type": "Double", "unit": "", - "value": 35.3441273720529 + "value": 8.55849054369323 }, { "name": "dram__sectors_read.sum", "type": "Double", "unit": "", - "value": 2097376.0 + "value": 10004.0 }, { "name": "dram__sectors_write.sum", "type": "Double", "unit": "", - "value": 1836284.0 + "value": 1840076.0 }, { "name": "lts__t_sectors.avg.pct_of_peak_sustained_elapsed", "type": "Double", "unit": "", - "value": 4.183121075111511 + "value": 1.1421725460446115 }, { "name": "lts__t_sectors_op_read.sum", "type": "Double", "unit": "", - "value": 2144514.0 + "value": 102630.0 }, { "name": "lts__t_sectors_op_write.sum", "type": "Double", "unit": "", - "value": 2098983.0 + "value": 2110951.0 }, { "name": "l1tex__t_requests_pipe_lsu_mem_global_op_ld.avg.pct_of_peak_sustained_active", "type": "Double", "unit": "", - "value": 14.76507042992011 + "value": 1.8628182807288678 }, { "name": "l1tex__t_requests_pipe_lsu_mem_global_op_ld.sum", "type": "Double", "unit": "", - "value": 17301504.0 + "value": 4194304.0 }, { "name": "l1tex__data_pipe_lsu_wavefronts_mem_shared.avg.pct_of_peak_sustained_elapsed", "type": "Double", "unit": "", - "value": 0.1677252434476355 + "value": 0.021825045546994465 }, { "name": "l1tex__data_pipe_lsu_wavefronts_mem_shared_op_ld.sum", @@ -42430,13 +45822,13 @@ "name": "sm__warps_active.avg.pct_of_peak_sustained_active", "type": "Double", "unit": "", - "value": 98.13349687939628 + "value": 73.8672618369765 }, { "name": "smsp__cycles_active.avg.pct_of_peak_sustained_elapsed", "type": "Double", "unit": "", - "value": 99.948972566863 + "value": 99.97344823713026 }, { "name": "smsp__sass_thread_inst_executed_op_fp16_pred_on.sum", @@ -42448,7 +45840,7 @@ "name": "smsp__sass_thread_inst_executed_op_fp32_pred_on.sum", "type": "Double", "unit": "", - "value": 4982833152.0 + "value": 4462739456.0 }, { "name": "smsp__sass_thread_inst_executed_op_fp64_pred_on.sum", @@ -42460,7 +45852,7 @@ "name": "smsp__sass_thread_inst_executed_op_integer_pred_on.sum", "type": "Double", "unit": "", - "value": 2772434944.0 + "value": 4806148096.0 }, { "name": "smsp__sass_thread_inst_executed_op_control_pred_on.sum", @@ -42472,31 +45864,31 @@ "name": "smsp__sass_thread_inst_executed_op_memory_pred_on.sum", "type": "Double", "unit": "", - "value": 570425344.0 + "value": 150994944.0 }, { "name": "smsp__sass_thread_inst_executed_op_misc_pred_on.sum", "type": "Double", "unit": "", - "value": 1912602624.0 + "value": 1222115328.0 }, { "name": "smsp__sass_thread_inst_executed_op_conversion_pred_on.sum", "type": "Double", "unit": "", - "value": 6291456.0 + "value": 1572864.0 }, { "name": "smsp__inst_executed.sum", "type": "Double", "unit": "", - "value": 400162816.0 + "value": 868564992.0 }, { "name": "smsp__pipe_fma_cycles_active.avg.pct_of_peak_sustained_active", "type": "Double", "unit": "", - "value": 40.77931219896015 + "value": 42.34274455283249 }, { "name": "smsp__inst_executed_pipe_fp64.avg.pct_of_peak_sustained_active", @@ -42508,13 +45900,13 @@ "name": "smsp__inst_executed_pipe_xu.avg.pct_of_peak_sustained_active", "type": "Double", "unit": "", - "value": 57.2794446604516 + "value": 29.806376305186394 }, { "name": "smsp__inst_executed_pipe_lsu.avg.pct_of_peak_sustained_active", "type": "Double", "unit": "", - "value": 7.691331680480563 + "value": 1.0587958379894093 }, { "name": "smsp__inst_executed_pipe_tex.avg.pct_of_peak_sustained_active", @@ -42526,7 +45918,7 @@ "name": "smsp__issue_active.avg.pct_of_peak_sustained_active", "type": "Double", "unit": "", - "value": 85.38855498145833 + "value": 96.44376418488963 }, { "name": "smsp__thread_inst_executed_per_inst_executed.ratio", @@ -42545,30 +45937,47 @@ "time" ], "times": { - "compilation": 18314.67, - "data": 61898.993, - "framework": 1192685.182, - "kernel_overhead": 526408.847, - "profiling_overhead": 51698.838, - "profiling_runs": 552678.504, + "compilation_time": 15065.145, + "data": 57998.98, + "framework": 3294262.958, + "kernel_overhead": 1574050.303, + "profiling_overhead": 47804.926, + "profiling_runs": 1614408.749, "runtimes": [ - 2946.08 + 6449.856 ], - "search_algorithm": 30.96, - "validation": 21.332 + "search_algorithm": 25.279, + "validation": 15.74 }, - "timestamp": "2026-01-27 09:27:1 UTC" + "timestamp": "2026-03-13 09:41:48 UTC" }, { + "compilation_data": { + "constant_memory_size": 0, + "global_size": { + "x": 8, + "y": 64, + "z": 16 + }, + "local_memory_size": 0, + "local_size": { + "x": 32, + "y": 4, + "z": 1 + }, + "max_work_group_size": 1024, + "private_memory_size": 0, + "registers": 38 + }, "configuration": { - "INNER_UNROLL_FACTOR": "4", - "USE_CONSTANT_MEMORY": "0", - "USE_SOA": "1", - "VECTOR_SIZE": "1", - "WORK_GROUP_SIZE_X": "16", - "WORK_GROUP_SIZE_Y": "4", - "WORK_GROUP_SIZE_Z": "1", - "Z_ITERATIONS": "16" + "INNER_UNROLL_FACTOR": 8, + "USE_CONSTANT_MEMORY": 0, + "USE_SOA": 1, + "VECTOR_SIZE": 1, + "WORK_GROUP_SIZE_X": 32, + "WORK_GROUP_SIZE_Y": 4, + "WORK_GROUP_SIZE_Z": 1, + "Z_ITERATIONS": 16 }, "correctness": 1, "invalidity": "correct", @@ -42576,61 +45985,61 @@ { "name": "time", "unit": "", - "value": 3838.592 + "value": 3364.928 }, { "name": "dram__throughput.avg.pct_of_peak_sustained_elapsed", "type": "Double", "unit": "", - "value": 27.339613493641053 + "value": 14.354911920052817 }, { "name": "dram__sectors_read.sum", "type": "Double", "unit": "", - "value": 2107540.0 + "value": 208.0 }, { "name": "dram__sectors_write.sum", "type": "Double", "unit": "", - "value": 1871288.0 + "value": 1836596.0 }, { "name": "lts__t_sectors.avg.pct_of_peak_sustained_elapsed", "type": "Double", "unit": "", - "value": 3.241809909109633 + "value": 1.8597127997880867 }, { "name": "lts__t_sectors_op_read.sum", "type": "Double", "unit": "", - "value": 2160315.0 + "value": 55042.0 }, { "name": "lts__t_sectors_op_write.sum", "type": "Double", "unit": "", - "value": 2105221.0 + "value": 2099177.0 }, { "name": "l1tex__t_requests_pipe_lsu_mem_global_op_ld.avg.pct_of_peak_sustained_active", "type": "Double", "unit": "", - "value": 5.920341785126696 + "value": 6.226595269392439 }, { "name": "l1tex__t_requests_pipe_lsu_mem_global_op_ld.sum", "type": "Double", "unit": "", - "value": 8912896.0 + "value": 8388608.0 }, { "name": "l1tex__data_pipe_lsu_wavefronts_mem_shared.avg.pct_of_peak_sustained_elapsed", "type": "Double", "unit": "", - "value": 0.0652788986225823 + "value": 0.07294408018245042 }, { "name": "l1tex__data_pipe_lsu_wavefronts_mem_shared_op_ld.sum", @@ -42660,13 +46069,13 @@ "name": "sm__warps_active.avg.pct_of_peak_sustained_active", "type": "Double", "unit": "", - "value": 98.63103645215946 + "value": 98.68898173548146 }, { "name": "smsp__cycles_active.avg.pct_of_peak_sustained_elapsed", "type": "Double", "unit": "", - "value": 99.95324780051602 + "value": 99.95521727369007 }, { "name": "smsp__sass_thread_inst_executed_op_fp16_pred_on.sum", @@ -42678,7 +46087,7 @@ "name": "smsp__sass_thread_inst_executed_op_fp32_pred_on.sum", "type": "Double", "unit": "", - "value": 4647288832.0 + "value": 4630511616.0 }, { "name": "smsp__sass_thread_inst_executed_op_fp64_pred_on.sum", @@ -42690,25 +46099,25 @@ "name": "smsp__sass_thread_inst_executed_op_integer_pred_on.sum", "type": "Double", "unit": "", - "value": 2963275776.0 + "value": 2563768320.0 }, { "name": "smsp__sass_thread_inst_executed_op_control_pred_on.sum", "type": "Double", "unit": "", - "value": 268435456.0 + "value": 134217728.0 }, { "name": "smsp__sass_thread_inst_executed_op_memory_pred_on.sum", "type": "Double", "unit": "", - "value": 301989888.0 + "value": 285212672.0 }, { "name": "smsp__sass_thread_inst_executed_op_misc_pred_on.sum", "type": "Double", "unit": "", - "value": 1764753408.0 + "value": 1362100224.0 }, { "name": "smsp__sass_thread_inst_executed_op_conversion_pred_on.sum", @@ -42720,13 +46129,13 @@ "name": "smsp__inst_executed.sum", "type": "Double", "unit": "", - "value": 514490368.0 + "value": 484179968.0 }, { "name": "smsp__pipe_fma_cycles_active.avg.pct_of_peak_sustained_active", "type": "Double", "unit": "", - "value": 33.11179792696698 + "value": 42.769914651694236 }, { "name": "smsp__inst_executed_pipe_fp64.avg.pct_of_peak_sustained_active", @@ -42738,13 +46147,13 @@ "name": "smsp__inst_executed_pipe_xu.avg.pct_of_peak_sustained_active", "type": "Double", "unit": "", - "value": 44.584572394574515 + "value": 49.81880228909951 }, { "name": "smsp__inst_executed_pipe_lsu.avg.pct_of_peak_sustained_active", "type": "Double", "unit": "", - "value": 3.1675074626028277 + "value": 3.3447682200933504 }, { "name": "smsp__inst_executed_pipe_tex.avg.pct_of_peak_sustained_active", @@ -42756,7 +46165,7 @@ "name": "smsp__issue_active.avg.pct_of_peak_sustained_active", "type": "Double", "unit": "", - "value": 85.45255218161616 + "value": 89.8595043679454 }, { "name": "smsp__thread_inst_executed_per_inst_executed.ratio", @@ -42775,30 +46184,47 @@ "time" ], "times": { - "compilation": 16263.629, - "data": 59758.113, - "framework": 1963097.582, - "kernel_overhead": 911511.945, - "profiling_overhead": 49702.323, - "profiling_runs": 942125.201, + "compilation_time": 14205.339, + "data": 59531.672, + "framework": 1710707.102, + "kernel_overhead": 786875.407, + "profiling_overhead": 49288.46, + "profiling_runs": 815011.563, "runtimes": [ - 3838.592 + 3364.928 ], - "search_algorithm": 24.409, - "validation": 16.34 + "search_algorithm": 24.27, + "validation": 13.345 }, - "timestamp": "2026-01-27 09:27:2 UTC" + "timestamp": "2026-03-13 09:41:49 UTC" }, { + "compilation_data": { + "constant_memory_size": 0, + "global_size": { + "x": 8, + "y": 64, + "z": 8 + }, + "local_memory_size": 0, + "local_size": { + "x": 32, + "y": 4, + "z": 1 + }, + "max_work_group_size": 1024, + "private_memory_size": 0, + "registers": 48 + }, "configuration": { - "INNER_UNROLL_FACTOR": "4", - "USE_CONSTANT_MEMORY": "0", - "USE_SOA": "1", - "VECTOR_SIZE": "1", - "WORK_GROUP_SIZE_X": "16", - "WORK_GROUP_SIZE_Y": "8", - "WORK_GROUP_SIZE_Z": "1", - "Z_ITERATIONS": "16" + "INNER_UNROLL_FACTOR": 8, + "USE_CONSTANT_MEMORY": 0, + "USE_SOA": 1, + "VECTOR_SIZE": 1, + "WORK_GROUP_SIZE_X": 32, + "WORK_GROUP_SIZE_Y": 4, + "WORK_GROUP_SIZE_Z": 1, + "Z_ITERATIONS": 32 }, "correctness": 1, "invalidity": "correct", @@ -42806,61 +46232,61 @@ { "name": "time", "unit": "", - "value": 3798.944 + "value": 6389.312 }, { "name": "dram__throughput.avg.pct_of_peak_sustained_elapsed", "type": "Double", "unit": "", - "value": 27.464091734706475 + "value": 7.66230961218482 }, { "name": "dram__sectors_read.sum", "type": "Double", "unit": "", - "value": 2107936.0 + "value": 932.0 }, { "name": "dram__sectors_write.sum", "type": "Double", "unit": "", - "value": 1879024.0 + "value": 1838140.0 }, { "name": "lts__t_sectors.avg.pct_of_peak_sustained_elapsed", "type": "Double", "unit": "", - "value": 3.275547742467619 + "value": 1.0171856882710093 }, { "name": "lts__t_sectors_op_read.sum", "type": "Double", "unit": "", - "value": 2158330.0 + "value": 105190.0 }, { "name": "lts__t_sectors_op_write.sum", "type": "Double", "unit": "", - "value": 2104737.0 + "value": 2103599.0 }, { "name": "l1tex__t_requests_pipe_lsu_mem_global_op_ld.avg.pct_of_peak_sustained_active", "type": "Double", "unit": "", - "value": 5.920933497043442 + "value": 1.6605635350152728 }, { "name": "l1tex__t_requests_pipe_lsu_mem_global_op_ld.sum", "type": "Double", "unit": "", - "value": 8912896.0 + "value": 4194304.0 }, { "name": "l1tex__data_pipe_lsu_wavefronts_mem_shared.avg.pct_of_peak_sustained_elapsed", "type": "Double", "unit": "", - "value": 0.06527046582067546 + "value": 0.019455713741598483 }, { "name": "l1tex__data_pipe_lsu_wavefronts_mem_shared_op_ld.sum", @@ -42890,13 +46316,13 @@ "name": "sm__warps_active.avg.pct_of_peak_sustained_active", "type": "Double", "unit": "", - "value": 98.58737933958763 + "value": 81.99569970983828 }, { "name": "smsp__cycles_active.avg.pct_of_peak_sustained_elapsed", "type": "Double", "unit": "", - "value": 99.94379863912566 + "value": 99.97164588593354 }, { "name": "smsp__sass_thread_inst_executed_op_fp16_pred_on.sum", @@ -42908,7 +46334,7 @@ "name": "smsp__sass_thread_inst_executed_op_fp32_pred_on.sum", "type": "Double", "unit": "", - "value": 4647288832.0 + "value": 4462739456.0 }, { "name": "smsp__sass_thread_inst_executed_op_fp64_pred_on.sum", @@ -42920,43 +46346,43 @@ "name": "smsp__sass_thread_inst_executed_op_integer_pred_on.sum", "type": "Double", "unit": "", - "value": 2963275776.0 + "value": 7219970048.0 }, { "name": "smsp__sass_thread_inst_executed_op_control_pred_on.sum", "type": "Double", "unit": "", - "value": 268435456.0 + "value": 134217728.0 }, { "name": "smsp__sass_thread_inst_executed_op_memory_pred_on.sum", "type": "Double", "unit": "", - "value": 301989888.0 + "value": 150994944.0 }, { "name": "smsp__sass_thread_inst_executed_op_misc_pred_on.sum", "type": "Double", "unit": "", - "value": 1764753408.0 + "value": 551026688.0 }, { "name": "smsp__sass_thread_inst_executed_op_conversion_pred_on.sum", "type": "Double", "unit": "", - "value": 3145728.0 + "value": 1572864.0 }, { "name": "smsp__inst_executed.sum", "type": "Double", "unit": "", - "value": 514490368.0 + "value": 952385536.0 }, { "name": "smsp__pipe_fma_cycles_active.avg.pct_of_peak_sustained_active", "type": "Double", "unit": "", - "value": 33.110903762826716 + "value": 43.242688095550214 }, { "name": "smsp__inst_executed_pipe_fp64.avg.pct_of_peak_sustained_active", @@ -42968,13 +46394,13 @@ "name": "smsp__inst_executed_pipe_xu.avg.pct_of_peak_sustained_active", "type": "Double", "unit": "", - "value": 44.58302760181231 + "value": 26.571068486263066 }, { "name": "smsp__inst_executed_pipe_lsu.avg.pct_of_peak_sustained_active", "type": "Double", "unit": "", - "value": 3.167397712921724 + "value": 0.9438697423709171 }, { "name": "smsp__inst_executed_pipe_tex.avg.pct_of_peak_sustained_active", @@ -42986,7 +46412,7 @@ "name": "smsp__issue_active.avg.pct_of_peak_sustained_active", "type": "Double", "unit": "", - "value": 85.4496192730343 + "value": 94.2721564537331 }, { "name": "smsp__thread_inst_executed_per_inst_executed.ratio", @@ -43005,30 +46431,47 @@ "time" ], "times": { - "compilation": 16113.744, - "data": 61520.031, - "framework": 1960876.639, - "kernel_overhead": 908802.276, - "profiling_overhead": 51127.168, - "profiling_runs": 939427.164, + "compilation_time": 14843.849, + "data": 57960.061, + "framework": 3659701.4299999997, + "kernel_overhead": 1755522.939, + "profiling_overhead": 47905.839, + "profiling_runs": 1798312.591, "runtimes": [ - 3798.944 + 6389.312 ], - "search_algorithm": 34.301, - "validation": 17.734 + "search_algorithm": 26.867, + "validation": 15.17 }, - "timestamp": "2026-01-27 09:27:3 UTC" + "timestamp": "2026-03-13 09:41:51 UTC" }, { + "compilation_data": { + "constant_memory_size": 0, + "global_size": { + "x": 8, + "y": 64, + "z": 8 + }, + "local_memory_size": 0, + "local_size": { + "x": 32, + "y": 4, + "z": 1 + }, + "max_work_group_size": 1024, + "private_memory_size": 0, + "registers": 48 + }, "configuration": { - "INNER_UNROLL_FACTOR": "4", - "USE_CONSTANT_MEMORY": "0", - "USE_SOA": "1", - "VECTOR_SIZE": "1", - "WORK_GROUP_SIZE_X": "32", - "WORK_GROUP_SIZE_Y": "2", - "WORK_GROUP_SIZE_Z": "1", - "Z_ITERATIONS": "16" + "INNER_UNROLL_FACTOR": 16, + "USE_CONSTANT_MEMORY": 0, + "USE_SOA": 1, + "VECTOR_SIZE": 1, + "WORK_GROUP_SIZE_X": 32, + "WORK_GROUP_SIZE_Y": 4, + "WORK_GROUP_SIZE_Z": 1, + "Z_ITERATIONS": 32 }, "correctness": 1, "invalidity": "correct", @@ -43036,61 +46479,61 @@ { "name": "time", "unit": "", - "value": 3888.576 + "value": 6066.144 }, { "name": "dram__throughput.avg.pct_of_peak_sustained_elapsed", "type": "Double", "unit": "", - "value": 27.160272538531917 + "value": 8.078333073531377 }, { "name": "dram__sectors_read.sum", "type": "Double", "unit": "", - "value": 2109376.0 + "value": 17556.0 }, { "name": "dram__sectors_write.sum", "type": "Double", "unit": "", - "value": 1842448.0 + "value": 1839804.0 }, { "name": "lts__t_sectors.avg.pct_of_peak_sustained_elapsed", "type": "Double", "unit": "", - "value": 3.247741266103399 + "value": 1.0685345809264712 }, { "name": "lts__t_sectors_op_read.sum", "type": "Double", "unit": "", - "value": 2173287.0 + "value": 116344.0 }, { "name": "lts__t_sectors_op_write.sum", "type": "Double", "unit": "", - "value": 2103834.0 + "value": 2106656.0 }, { "name": "l1tex__t_requests_pipe_lsu_mem_global_op_ld.avg.pct_of_peak_sustained_active", "type": "Double", "unit": "", - "value": 5.921824139251356 + "value": 1.7421343847696686 }, { "name": "l1tex__t_requests_pipe_lsu_mem_global_op_ld.sum", "type": "Double", "unit": "", - "value": 8912896.0 + "value": 4194304.0 }, { "name": "l1tex__data_pipe_lsu_wavefronts_mem_shared.avg.pct_of_peak_sustained_elapsed", "type": "Double", "unit": "", - "value": 0.06529470906153788 + "value": 0.020411216242384785 }, { "name": "l1tex__data_pipe_lsu_wavefronts_mem_shared_op_ld.sum", @@ -43120,13 +46563,13 @@ "name": "sm__warps_active.avg.pct_of_peak_sustained_active", "type": "Double", "unit": "", - "value": 98.701558355762 + "value": 81.9838790862558 }, { "name": "smsp__cycles_active.avg.pct_of_peak_sustained_elapsed", "type": "Double", "unit": "", - "value": 99.96346883110196 + "value": 99.97219629985578 }, { "name": "smsp__sass_thread_inst_executed_op_fp16_pred_on.sum", @@ -43138,7 +46581,7 @@ "name": "smsp__sass_thread_inst_executed_op_fp32_pred_on.sum", "type": "Double", "unit": "", - "value": 4647288832.0 + "value": 4462739456.0 }, { "name": "smsp__sass_thread_inst_executed_op_fp64_pred_on.sum", @@ -43150,43 +46593,43 @@ "name": "smsp__sass_thread_inst_executed_op_integer_pred_on.sum", "type": "Double", "unit": "", - "value": 2963275776.0 + "value": 5944901632.0 }, { "name": "smsp__sass_thread_inst_executed_op_control_pred_on.sum", "type": "Double", "unit": "", - "value": 268435456.0 + "value": 101187584.0 }, { "name": "smsp__sass_thread_inst_executed_op_memory_pred_on.sum", "type": "Double", "unit": "", - "value": 301989888.0 + "value": 150994944.0 }, { "name": "smsp__sass_thread_inst_executed_op_misc_pred_on.sum", "type": "Double", "unit": "", - "value": 1764753408.0 + "value": 1054343168.0 }, { "name": "smsp__sass_thread_inst_executed_op_conversion_pred_on.sum", "type": "Double", "unit": "", - "value": 3145728.0 + "value": 1572864.0 }, { "name": "smsp__inst_executed.sum", "type": "Double", "unit": "", - "value": 514490368.0 + "value": 913571840.0 }, { "name": "smsp__pipe_fma_cycles_active.avg.pct_of_peak_sustained_active", "type": "Double", "unit": "", - "value": 33.116625809974 + "value": 44.059706995924756 }, { "name": "smsp__inst_executed_pipe_fp64.avg.pct_of_peak_sustained_active", @@ -43198,13 +46641,13 @@ "name": "smsp__inst_executed_pipe_xu.avg.pct_of_peak_sustained_active", "type": "Double", "unit": "", - "value": 44.59081093046387 + "value": 27.8758644313618 }, { "name": "smsp__inst_executed_pipe_lsu.avg.pct_of_peak_sustained_active", "type": "Double", "unit": "", - "value": 3.167950678897701 + "value": 0.9902193053620951 }, { "name": "smsp__inst_executed_pipe_tex.avg.pct_of_peak_sustained_active", @@ -43216,7 +46659,7 @@ "name": "smsp__issue_active.avg.pct_of_peak_sustained_active", "type": "Double", "unit": "", - "value": 85.46450588359913 + "value": 94.98142745324343 }, { "name": "smsp__thread_inst_executed_per_inst_executed.ratio", @@ -43235,30 +46678,47 @@ "time" ], "times": { - "compilation": 14921.683, - "data": 61362.958, - "framework": 1956074.3960000002, - "kernel_overhead": 906600.312, - "profiling_overhead": 50826.795, - "profiling_runs": 937284.331, + "compilation_time": 14149.003, + "data": 58181.702, + "framework": 3570319.691, + "kernel_overhead": 1711376.026, + "profiling_overhead": 47982.46, + "profiling_runs": 1752779.503, "runtimes": [ - 3888.576 + 6066.144 ], - "search_algorithm": 22.67, - "validation": 14.919 + "search_algorithm": 26.012, + "validation": 15.612 }, - "timestamp": "2026-01-27 09:27:4 UTC" + "timestamp": "2026-03-13 09:41:53 UTC" }, { + "compilation_data": { + "constant_memory_size": 0, + "global_size": { + "x": 8, + "y": 32, + "z": 256 + }, + "local_memory_size": 0, + "local_size": { + "x": 32, + "y": 8, + "z": 1 + }, + "max_work_group_size": 1024, + "private_memory_size": 0, + "registers": 38 + }, "configuration": { - "INNER_UNROLL_FACTOR": "4", - "USE_CONSTANT_MEMORY": "0", - "USE_SOA": "1", - "VECTOR_SIZE": "1", - "WORK_GROUP_SIZE_X": "32", - "WORK_GROUP_SIZE_Y": "4", - "WORK_GROUP_SIZE_Z": "1", - "Z_ITERATIONS": "16" + "INNER_UNROLL_FACTOR": 0, + "USE_CONSTANT_MEMORY": 0, + "USE_SOA": 1, + "VECTOR_SIZE": 1, + "WORK_GROUP_SIZE_X": 32, + "WORK_GROUP_SIZE_Y": 8, + "WORK_GROUP_SIZE_Z": 1, + "Z_ITERATIONS": 1 }, "correctness": 1, "invalidity": "correct", @@ -43266,61 +46726,61 @@ { "name": "time", "unit": "", - "value": 4065.76 + "value": 6957.568 }, { "name": "dram__throughput.avg.pct_of_peak_sustained_elapsed", "type": "Double", "unit": "", - "value": 27.34573177762404 + "value": 6.798203563933408 }, { "name": "dram__sectors_read.sum", "type": "Double", "unit": "", - "value": 2105048.0 + "value": 9156.0 }, { "name": "dram__sectors_write.sum", "type": "Double", "unit": "", - "value": 1841348.0 + "value": 1840112.0 }, { "name": "lts__t_sectors.avg.pct_of_peak_sustained_elapsed", "type": "Double", "unit": "", - "value": 3.2851480992681577 + "value": 0.9238513861004161 }, { "name": "lts__t_sectors_op_read.sum", "type": "Double", "unit": "", - "value": 2170344.0 + "value": 120875.0 }, { "name": "lts__t_sectors_op_write.sum", "type": "Double", "unit": "", - "value": 2105011.0 + "value": 2105704.0 }, { "name": "l1tex__t_requests_pipe_lsu_mem_global_op_ld.avg.pct_of_peak_sustained_active", "type": "Double", "unit": "", - "value": 5.921794866539238 + "value": 48.267919611155804 }, { "name": "l1tex__t_requests_pipe_lsu_mem_global_op_ld.sum", "type": "Double", "unit": "", - "value": 8912896.0 + "value": 134217728.0 }, { "name": "l1tex__data_pipe_lsu_wavefronts_mem_shared.avg.pct_of_peak_sustained_elapsed", "type": "Double", "unit": "", - "value": 0.06529511500293424 + "value": 0.565533146761012 }, { "name": "l1tex__data_pipe_lsu_wavefronts_mem_shared_op_ld.sum", @@ -43350,13 +46810,13 @@ "name": "sm__warps_active.avg.pct_of_peak_sustained_active", "type": "Double", "unit": "", - "value": 98.62796612792587 + "value": 93.0287514508357 }, { "name": "smsp__cycles_active.avg.pct_of_peak_sustained_elapsed", "type": "Double", "unit": "", - "value": 99.9679176905339 + "value": 99.9699135879803 }, { "name": "smsp__sass_thread_inst_executed_op_fp16_pred_on.sum", @@ -43368,7 +46828,7 @@ "name": "smsp__sass_thread_inst_executed_op_fp32_pred_on.sum", "type": "Double", "unit": "", - "value": 4647288832.0 + "value": 8589934592.0 }, { "name": "smsp__sass_thread_inst_executed_op_fp64_pred_on.sum", @@ -43380,43 +46840,43 @@ "name": "smsp__sass_thread_inst_executed_op_integer_pred_on.sum", "type": "Double", "unit": "", - "value": 2963275776.0 + "value": 2952790016.0 }, { "name": "smsp__sass_thread_inst_executed_op_control_pred_on.sum", "type": "Double", "unit": "", - "value": 268435456.0 + "value": 285212672.0 }, { "name": "smsp__sass_thread_inst_executed_op_memory_pred_on.sum", "type": "Double", "unit": "", - "value": 301989888.0 + "value": 4311744512.0 }, { "name": "smsp__sass_thread_inst_executed_op_misc_pred_on.sum", "type": "Double", "unit": "", - "value": 1764753408.0 + "value": 318767104.0 }, { "name": "smsp__sass_thread_inst_executed_op_conversion_pred_on.sum", "type": "Double", "unit": "", - "value": 3145728.0 + "value": 50331648.0 }, { "name": "smsp__inst_executed.sum", "type": "Double", "unit": "", - "value": 514490368.0 + "value": 527958016.0 }, { "name": "smsp__pipe_fma_cycles_active.avg.pct_of_peak_sustained_active", "type": "Double", "unit": "", - "value": 33.1153806607518 + "value": 21.49664130092607 }, { "name": "smsp__inst_executed_pipe_fp64.avg.pct_of_peak_sustained_active", @@ -43428,13 +46888,13 @@ "name": "smsp__inst_executed_pipe_xu.avg.pct_of_peak_sustained_active", "type": "Double", "unit": "", - "value": 44.589103722914324 + "value": 24.136676121628987 }, { "name": "smsp__inst_executed_pipe_lsu.avg.pct_of_peak_sustained_active", "type": "Double", "unit": "", - "value": 3.16782939047072 + "value": 24.51381168602944 }, { "name": "smsp__inst_executed_pipe_tex.avg.pct_of_peak_sustained_active", @@ -43446,7 +46906,7 @@ "name": "smsp__issue_active.avg.pct_of_peak_sustained_active", "type": "Double", "unit": "", - "value": 85.4612650104988 + "value": 47.47238731087597 }, { "name": "smsp__thread_inst_executed_per_inst_executed.ratio", @@ -43465,30 +46925,47 @@ "time" ], "times": { - "compilation": 14331.455, - "data": 64541.018, - "framework": 1964715.099, - "kernel_overhead": 907647.137, - "profiling_overhead": 54267.519, - "profiling_runs": 938259.425, + "compilation_time": 13997.903, + "data": 58256.998, + "framework": 501594.5850000001, + "kernel_overhead": 173866.888, + "profiling_overhead": 48243.893, + "profiling_runs": 221226.806, "runtimes": [ - 4065.76 + 6957.568 ], - "search_algorithm": 24.14, - "validation": 17.987 + "search_algorithm": 25.734, + "validation": 14.608 }, - "timestamp": "2026-01-27 09:27:5 UTC" + "timestamp": "2026-03-13 09:41:53 UTC" }, { + "compilation_data": { + "constant_memory_size": 0, + "global_size": { + "x": 8, + "y": 32, + "z": 128 + }, + "local_memory_size": 0, + "local_size": { + "x": 32, + "y": 8, + "z": 1 + }, + "max_work_group_size": 1024, + "private_memory_size": 0, + "registers": 39 + }, "configuration": { - "INNER_UNROLL_FACTOR": "4", - "USE_CONSTANT_MEMORY": "0", - "USE_SOA": "1", - "VECTOR_SIZE": "1", - "WORK_GROUP_SIZE_X": "32", - "WORK_GROUP_SIZE_Y": "8", - "WORK_GROUP_SIZE_Z": "1", - "Z_ITERATIONS": "16" + "INNER_UNROLL_FACTOR": 0, + "USE_CONSTANT_MEMORY": 0, + "USE_SOA": 1, + "VECTOR_SIZE": 1, + "WORK_GROUP_SIZE_X": 32, + "WORK_GROUP_SIZE_Y": 8, + "WORK_GROUP_SIZE_Z": 1, + "Z_ITERATIONS": 2 }, "correctness": 1, "invalidity": "correct", @@ -43496,61 +46973,61 @@ { "name": "time", "unit": "", - "value": 3784.672 + "value": 3682.016 }, { "name": "dram__throughput.avg.pct_of_peak_sustained_elapsed", "type": "Double", "unit": "", - "value": 27.450228255431675 + "value": 13.13378445464648 }, { "name": "dram__sectors_read.sum", "type": "Double", "unit": "", - "value": 2097368.0 + "value": 5336.0 }, { "name": "dram__sectors_write.sum", "type": "Double", "unit": "", - "value": 1835648.0 + "value": 1840236.0 }, { "name": "lts__t_sectors.avg.pct_of_peak_sustained_elapsed", "type": "Double", "unit": "", - "value": 3.2591998329029312 + "value": 1.7236755193112634 }, { "name": "lts__t_sectors_op_read.sum", "type": "Double", "unit": "", - "value": 2157472.0 + "value": 68993.0 }, { "name": "lts__t_sectors_op_write.sum", "type": "Double", "unit": "", - "value": 2098982.0 + "value": 2108100.0 }, { "name": "l1tex__t_requests_pipe_lsu_mem_global_op_ld.avg.pct_of_peak_sustained_active", "type": "Double", "unit": "", - "value": 5.907904367166177 + "value": 45.73022081998979 }, { "name": "l1tex__t_requests_pipe_lsu_mem_global_op_ld.sum", "type": "Double", "unit": "", - "value": 8912896.0 + "value": 67108864.0 }, { "name": "l1tex__data_pipe_lsu_wavefronts_mem_shared.avg.pct_of_peak_sustained_elapsed", "type": "Double", "unit": "", - "value": 0.06514314686991102 + "value": 0.5356231863858517 }, { "name": "l1tex__data_pipe_lsu_wavefronts_mem_shared_op_ld.sum", @@ -43580,13 +47057,13 @@ "name": "sm__warps_active.avg.pct_of_peak_sustained_active", "type": "Double", "unit": "", - "value": 98.35872783046771 + "value": 93.63457191257216 }, { "name": "smsp__cycles_active.avg.pct_of_peak_sustained_elapsed", "type": "Double", "unit": "", - "value": 99.95623377054027 + "value": 99.94147683011808 }, { "name": "smsp__sass_thread_inst_executed_op_fp16_pred_on.sum", @@ -43598,7 +47075,7 @@ "name": "smsp__sass_thread_inst_executed_op_fp32_pred_on.sum", "type": "Double", "unit": "", - "value": 4647288832.0 + "value": 6442450944.0 }, { "name": "smsp__sass_thread_inst_executed_op_fp64_pred_on.sum", @@ -43610,43 +47087,43 @@ "name": "smsp__sass_thread_inst_executed_op_integer_pred_on.sum", "type": "Double", "unit": "", - "value": 2963275776.0 + "value": 1509949440.0 }, { "name": "smsp__sass_thread_inst_executed_op_control_pred_on.sum", "type": "Double", "unit": "", - "value": 268435456.0 + "value": 142606336.0 }, { "name": "smsp__sass_thread_inst_executed_op_memory_pred_on.sum", "type": "Double", "unit": "", - "value": 301989888.0 + "value": 2164260864.0 }, { "name": "smsp__sass_thread_inst_executed_op_misc_pred_on.sum", "type": "Double", "unit": "", - "value": 1764753408.0 + "value": 184549376.0 }, { "name": "smsp__sass_thread_inst_executed_op_conversion_pred_on.sum", "type": "Double", "unit": "", - "value": 3145728.0 + "value": 25165824.0 }, { "name": "smsp__inst_executed.sum", "type": "Double", "unit": "", - "value": 514490368.0 + "value": 333971456.0 }, { "name": "smsp__pipe_fma_cycles_active.avg.pct_of_peak_sustained_active", "type": "Double", "unit": "", - "value": 33.04199130385748 + "value": 28.985227077057633 }, { "name": "smsp__inst_executed_pipe_fp64.avg.pct_of_peak_sustained_active", @@ -43658,13 +47135,13 @@ "name": "smsp__inst_executed_pipe_xu.avg.pct_of_peak_sustained_active", "type": "Double", "unit": "", - "value": 44.49052675588708 + "value": 45.733276468005954 }, { "name": "smsp__inst_executed_pipe_lsu.avg.pct_of_peak_sustained_active", "type": "Double", "unit": "", - "value": 3.1608259975495954 + "value": 23.31325226201085 }, { "name": "smsp__inst_executed_pipe_tex.avg.pct_of_peak_sustained_active", @@ -43676,7 +47153,7 @@ "name": "smsp__issue_active.avg.pct_of_peak_sustained_active", "type": "Double", "unit": "", - "value": 85.27231650751766 + "value": 56.89926639543397 }, { "name": "smsp__thread_inst_executed_per_inst_executed.ratio", @@ -43695,30 +47172,47 @@ "time" ], "times": { - "compilation": 14841.792, - "data": 65182.277, - "framework": 1965264.793, - "kernel_overhead": 907449.779, - "profiling_overhead": 54901.21, - "profiling_runs": 937731.527, + "compilation_time": 14604.395, + "data": 57284.626, + "framework": 316094.765, + "kernel_overhead": 90851.945, + "profiling_overhead": 47107.972, + "profiling_runs": 120850.222, "runtimes": [ - 3784.672 + 3682.016 ], - "search_algorithm": 24.233, - "validation": 15.707 + "search_algorithm": 32.486, + "validation": 15.322 }, - "timestamp": "2026-01-27 09:27:6 UTC" + "timestamp": "2026-03-13 09:41:53 UTC" }, { + "compilation_data": { + "constant_memory_size": 0, + "global_size": { + "x": 8, + "y": 32, + "z": 64 + }, + "local_memory_size": 0, + "local_size": { + "x": 32, + "y": 8, + "z": 1 + }, + "max_work_group_size": 1024, + "private_memory_size": 0, + "registers": 40 + }, "configuration": { - "INNER_UNROLL_FACTOR": "4", - "USE_CONSTANT_MEMORY": "0", - "USE_SOA": "1", - "VECTOR_SIZE": "1", - "WORK_GROUP_SIZE_X": "16", - "WORK_GROUP_SIZE_Y": "4", - "WORK_GROUP_SIZE_Z": "1", - "Z_ITERATIONS": "32" + "INNER_UNROLL_FACTOR": 0, + "USE_CONSTANT_MEMORY": 0, + "USE_SOA": 1, + "VECTOR_SIZE": 1, + "WORK_GROUP_SIZE_X": 32, + "WORK_GROUP_SIZE_Y": 8, + "WORK_GROUP_SIZE_Z": 1, + "Z_ITERATIONS": 4 }, "correctness": 1, "invalidity": "correct", @@ -43726,61 +47220,61 @@ { "name": "time", "unit": "", - "value": 6370.112 + "value": 2280.896 }, { "name": "dram__throughput.avg.pct_of_peak_sustained_elapsed", "type": "Double", "unit": "", - "value": 16.449185111144104 + "value": 21.660699479912566 }, { "name": "dram__sectors_read.sum", "type": "Double", "unit": "", - "value": 2104108.0 + "value": 1816.0 }, { "name": "dram__sectors_write.sum", "type": "Double", "unit": "", - "value": 1866520.0 + "value": 1837368.0 }, { "name": "lts__t_sectors.avg.pct_of_peak_sustained_elapsed", "type": "Double", "unit": "", - "value": 1.9529420203883165 + "value": 2.7814731422150176 }, { "name": "lts__t_sectors_op_read.sum", "type": "Double", "unit": "", - "value": 2196502.0 + "value": 37855.0 }, { "name": "lts__t_sectors_op_write.sum", "type": "Double", "unit": "", - "value": 2100436.0 + "value": 2099679.0 }, { "name": "l1tex__t_requests_pipe_lsu_mem_global_op_ld.avg.pct_of_peak_sustained_active", "type": "Double", "unit": "", - "value": 1.8561652782151044 + "value": 37.987090510374344 }, { "name": "l1tex__t_requests_pipe_lsu_mem_global_op_ld.sum", "type": "Double", "unit": "", - "value": 4718592.0 + "value": 33554432.0 }, { "name": "l1tex__data_pipe_lsu_wavefronts_mem_shared.avg.pct_of_peak_sustained_elapsed", "type": "Double", "unit": "", - "value": 0.019333251414554282 + "value": 0.4449089774995648 }, { "name": "l1tex__data_pipe_lsu_wavefronts_mem_shared_op_ld.sum", @@ -43810,13 +47304,13 @@ "name": "sm__warps_active.avg.pct_of_peak_sustained_active", "type": "Double", "unit": "", - "value": 73.8187109177256 + "value": 95.52295814422723 }, { "name": "smsp__cycles_active.avg.pct_of_peak_sustained_elapsed", "type": "Double", "unit": "", - "value": 99.98806648055056 + "value": 99.91324825960773 }, { "name": "smsp__sass_thread_inst_executed_op_fp16_pred_on.sum", @@ -43828,7 +47322,7 @@ "name": "smsp__sass_thread_inst_executed_op_fp32_pred_on.sum", "type": "Double", "unit": "", - "value": 4479516672.0 + "value": 5368709120.0 }, { "name": "smsp__sass_thread_inst_executed_op_fp64_pred_on.sum", @@ -43840,43 +47334,43 @@ "name": "smsp__sass_thread_inst_executed_op_integer_pred_on.sum", "type": "Double", "unit": "", - "value": 4400349184.0 + "value": 591396864.0 }, { "name": "smsp__sass_thread_inst_executed_op_control_pred_on.sum", "type": "Double", "unit": "", - "value": 268435456.0 + "value": 71303168.0 }, { "name": "smsp__sass_thread_inst_executed_op_memory_pred_on.sum", "type": "Double", "unit": "", - "value": 167772160.0 + "value": 1090519040.0 }, { "name": "smsp__sass_thread_inst_executed_op_misc_pred_on.sum", "type": "Double", "unit": "", - "value": 1626341376.0 + "value": 146800640.0 }, { "name": "smsp__sass_thread_inst_executed_op_conversion_pred_on.sum", "type": "Double", "unit": "", - "value": 1572864.0 + "value": 12582912.0 }, { "name": "smsp__inst_executed.sum", "type": "Double", "unit": "", - "value": 869564416.0 + "value": 228327424.0 }, { "name": "smsp__pipe_fma_cycles_active.avg.pct_of_peak_sustained_active", "type": "Double", "unit": "", - "value": 29.09031275275254 + "value": 43.3051003511832 }, { "name": "smsp__inst_executed_pipe_fp64.avg.pct_of_peak_sustained_active", @@ -43888,13 +47382,13 @@ "name": "smsp__inst_executed_pipe_xu.avg.pct_of_peak_sustained_active", "type": "Double", "unit": "", - "value": 26.3994829854414 + "value": 75.99706093293204 }, { "name": "smsp__inst_executed_pipe_lsu.avg.pct_of_peak_sustained_active", "type": "Double", "unit": "", - "value": 1.0408975835324183 + "value": 19.518776391954223 }, { "name": "smsp__inst_executed_pipe_tex.avg.pct_of_peak_sustained_active", @@ -43906,7 +47400,7 @@ "name": "smsp__issue_active.avg.pct_of_peak_sustained_active", "type": "Double", "unit": "", - "value": 85.51839862242483 + "value": 64.6431116728929 }, { "name": "smsp__thread_inst_executed_per_inst_executed.ratio", @@ -43925,30 +47419,47 @@ "time" ], "times": { - "compilation": 14712.604, - "data": 62591.227, - "framework": 3144707.04, - "kernel_overhead": 1493517.207, - "profiling_overhead": 52276.215, - "profiling_runs": 1536322.391, + "compilation_time": 14354.838, + "data": 57959.172, + "framework": 203667.863, + "kernel_overhead": 37275.716, + "profiling_overhead": 48095.971, + "profiling_runs": 60337.004, "runtimes": [ - 6370.112 + 2280.896 ], - "search_algorithm": 42.298, - "validation": 19.184 + "search_algorithm": 38.689, + "validation": 17.502 }, - "timestamp": "2026-01-27 09:27:8 UTC" + "timestamp": "2026-03-13 09:41:53 UTC" }, { + "compilation_data": { + "constant_memory_size": 0, + "global_size": { + "x": 8, + "y": 32, + "z": 32 + }, + "local_memory_size": 0, + "local_size": { + "x": 32, + "y": 8, + "z": 1 + }, + "max_work_group_size": 1024, + "private_memory_size": 0, + "registers": 40 + }, "configuration": { - "INNER_UNROLL_FACTOR": "4", - "USE_CONSTANT_MEMORY": "0", - "USE_SOA": "1", - "VECTOR_SIZE": "1", - "WORK_GROUP_SIZE_X": "16", - "WORK_GROUP_SIZE_Y": "8", - "WORK_GROUP_SIZE_Z": "1", - "Z_ITERATIONS": "32" + "INNER_UNROLL_FACTOR": 0, + "USE_CONSTANT_MEMORY": 0, + "USE_SOA": 1, + "VECTOR_SIZE": 1, + "WORK_GROUP_SIZE_X": 32, + "WORK_GROUP_SIZE_Y": 8, + "WORK_GROUP_SIZE_Z": 1, + "Z_ITERATIONS": 8 }, "correctness": 1, "invalidity": "correct", @@ -43956,61 +47467,61 @@ { "name": "time", "unit": "", - "value": 6331.552 + "value": 1967.104 }, { "name": "dram__throughput.avg.pct_of_peak_sustained_elapsed", "type": "Double", "unit": "", - "value": 16.43884281745106 + "value": 25.006088650754993 }, { "name": "dram__sectors_read.sum", "type": "Double", "unit": "", - "value": 2098996.0 + "value": 2332.0 }, { "name": "dram__sectors_write.sum", "type": "Double", "unit": "", - "value": 1869388.0 + "value": 1837604.0 }, { "name": "lts__t_sectors.avg.pct_of_peak_sustained_elapsed", "type": "Double", "unit": "", - "value": 1.9507855209856535 + "value": 3.2050074045514734 }, { "name": "lts__t_sectors_op_read.sum", "type": "Double", "unit": "", - "value": 2191896.0 + "value": 33843.0 }, { "name": "lts__t_sectors_op_write.sum", "type": "Double", "unit": "", - "value": 2100560.0 + "value": 2099273.0 }, { "name": "l1tex__t_requests_pipe_lsu_mem_global_op_ld.avg.pct_of_peak_sustained_active", "type": "Double", "unit": "", - "value": 1.856128770695742 + "value": 21.78864540604333 }, { "name": "l1tex__t_requests_pipe_lsu_mem_global_op_ld.sum", "type": "Double", "unit": "", - "value": 4718592.0 + "value": 16777216.0 }, { "name": "l1tex__data_pipe_lsu_wavefronts_mem_shared.avg.pct_of_peak_sustained_elapsed", "type": "Double", "unit": "", - "value": 0.01933204755079571 + "value": 0.2551985966569358 }, { "name": "l1tex__data_pipe_lsu_wavefronts_mem_shared_op_ld.sum", @@ -44040,13 +47551,13 @@ "name": "sm__warps_active.avg.pct_of_peak_sustained_active", "type": "Double", "unit": "", - "value": 73.78224769913786 + "value": 97.00789740239885 }, { "name": "smsp__cycles_active.avg.pct_of_peak_sustained_elapsed", "type": "Double", "unit": "", - "value": 99.98102006458238 + "value": 99.9007219204442 }, { "name": "smsp__sass_thread_inst_executed_op_fp16_pred_on.sum", @@ -44058,7 +47569,7 @@ "name": "smsp__sass_thread_inst_executed_op_fp32_pred_on.sum", "type": "Double", "unit": "", - "value": 4479516672.0 + "value": 4831838208.0 }, { "name": "smsp__sass_thread_inst_executed_op_fp64_pred_on.sum", @@ -44070,43 +47581,43 @@ "name": "smsp__sass_thread_inst_executed_op_integer_pred_on.sum", "type": "Double", "unit": "", - "value": 4400349184.0 + "value": 564133888.0 }, { "name": "smsp__sass_thread_inst_executed_op_control_pred_on.sum", "type": "Double", "unit": "", - "value": 268435456.0 + "value": 69206016.0 }, { "name": "smsp__sass_thread_inst_executed_op_memory_pred_on.sum", "type": "Double", "unit": "", - "value": 167772160.0 + "value": 553648128.0 }, { "name": "smsp__sass_thread_inst_executed_op_misc_pred_on.sum", "type": "Double", "unit": "", - "value": 1626341376.0 + "value": 115343360.0 }, { "name": "smsp__sass_thread_inst_executed_op_conversion_pred_on.sum", "type": "Double", "unit": "", - "value": 1572864.0 + "value": 6291456.0 }, { "name": "smsp__inst_executed.sum", "type": "Double", "unit": "", - "value": 869564416.0 + "value": 192282624.0 }, { "name": "smsp__pipe_fma_cycles_active.avg.pct_of_peak_sustained_active", "type": "Double", "unit": "", - "value": 29.09055259183708 + "value": 44.150110665681744 }, { "name": "smsp__inst_executed_pipe_fp64.avg.pct_of_peak_sustained_active", @@ -44118,13 +47629,13 @@ "name": "smsp__inst_executed_pipe_xu.avg.pct_of_peak_sustained_active", "type": "Double", "unit": "", - "value": 26.399699568614977 + "value": 87.19435253758117 }, { "name": "smsp__inst_executed_pipe_lsu.avg.pct_of_peak_sustained_active", "type": "Double", "unit": "", - "value": 1.0409061231277634 + "value": 11.367623109147544 }, { "name": "smsp__inst_executed_pipe_tex.avg.pct_of_peak_sustained_active", @@ -44136,7 +47647,7 @@ "name": "smsp__issue_active.avg.pct_of_peak_sustained_active", "type": "Double", "unit": "", - "value": 85.51910022124717 + "value": 62.45926200106487 }, { "name": "smsp__thread_inst_executed_per_inst_executed.ratio", @@ -44155,30 +47666,47 @@ "time" ], "times": { - "compilation": 14492.703, - "data": 63589.396, - "framework": 3153478.743, - "kernel_overhead": 1496769.331, - "profiling_overhead": 53709.176, - "profiling_runs": 1539410.84, + "compilation_time": 14882.971, + "data": 56807.149, + "framework": 185344.615, + "kernel_overhead": 29928.914, + "profiling_overhead": 47267.7, + "profiling_runs": 51340.852, "runtimes": [ - 6331.552 + 1967.104 ], - "search_algorithm": 26.842, - "validation": 16.854 + "search_algorithm": 27.15, + "validation": 13.97 }, - "timestamp": "2026-01-27 09:27:9 UTC" + "timestamp": "2026-03-13 09:41:54 UTC" }, { + "compilation_data": { + "constant_memory_size": 0, + "global_size": { + "x": 8, + "y": 32, + "z": 16 + }, + "local_memory_size": 0, + "local_size": { + "x": 32, + "y": 8, + "z": 1 + }, + "max_work_group_size": 1024, + "private_memory_size": 0, + "registers": 40 + }, "configuration": { - "INNER_UNROLL_FACTOR": "4", - "USE_CONSTANT_MEMORY": "0", - "USE_SOA": "1", - "VECTOR_SIZE": "1", - "WORK_GROUP_SIZE_X": "32", - "WORK_GROUP_SIZE_Y": "2", - "WORK_GROUP_SIZE_Z": "1", - "Z_ITERATIONS": "32" + "INNER_UNROLL_FACTOR": 0, + "USE_CONSTANT_MEMORY": 0, + "USE_SOA": 1, + "VECTOR_SIZE": 1, + "WORK_GROUP_SIZE_X": 32, + "WORK_GROUP_SIZE_Y": 8, + "WORK_GROUP_SIZE_Z": 1, + "Z_ITERATIONS": 16 }, "correctness": 1, "invalidity": "correct", @@ -44186,61 +47714,61 @@ { "name": "time", "unit": "", - "value": 6372.064 + "value": 1943.488 }, { "name": "dram__throughput.avg.pct_of_peak_sustained_elapsed", "type": "Double", "unit": "", - "value": 16.327073455945524 + "value": 26.74439829972468 }, { "name": "dram__sectors_read.sum", "type": "Double", "unit": "", - "value": 2102600.0 + "value": 1560.0 }, { "name": "dram__sectors_write.sum", "type": "Double", "unit": "", - "value": 1838092.0 + "value": 1838660.0 }, { "name": "lts__t_sectors.avg.pct_of_peak_sustained_elapsed", "type": "Double", "unit": "", - "value": 1.9571882812544885 + "value": 3.4327579852753622 }, { "name": "lts__t_sectors_op_read.sum", "type": "Double", "unit": "", - "value": 2205364.0 + "value": 32040.0 }, { "name": "lts__t_sectors_op_write.sum", "type": "Double", "unit": "", - "value": 2100396.0 + "value": 2103086.0 }, { "name": "l1tex__t_requests_pipe_lsu_mem_global_op_ld.avg.pct_of_peak_sustained_active", "type": "Double", "unit": "", - "value": 1.856600755693582 + "value": 11.652994098161274 }, { "name": "l1tex__t_requests_pipe_lsu_mem_global_op_ld.sum", "type": "Double", "unit": "", - "value": 4718592.0 + "value": 8388608.0 }, { "name": "l1tex__data_pipe_lsu_wavefronts_mem_shared.avg.pct_of_peak_sustained_elapsed", "type": "Double", "unit": "", - "value": 0.019335631600577715 + "value": 0.13645152306461591 }, { "name": "l1tex__data_pipe_lsu_wavefronts_mem_shared_op_ld.sum", @@ -44270,13 +47798,13 @@ "name": "sm__warps_active.avg.pct_of_peak_sustained_active", "type": "Double", "unit": "", - "value": 73.83099396329749 + "value": 98.10847554544715 }, { "name": "smsp__cycles_active.avg.pct_of_peak_sustained_elapsed", "type": "Double", "unit": "", - "value": 99.9757556191111 + "value": 99.91000326359884 }, { "name": "smsp__sass_thread_inst_executed_op_fp16_pred_on.sum", @@ -44288,7 +47816,7 @@ "name": "smsp__sass_thread_inst_executed_op_fp32_pred_on.sum", "type": "Double", "unit": "", - "value": 4479516672.0 + "value": 4563402752.0 }, { "name": "smsp__sass_thread_inst_executed_op_fp64_pred_on.sum", @@ -44300,43 +47828,43 @@ "name": "smsp__sass_thread_inst_executed_op_integer_pred_on.sum", "type": "Double", "unit": "", - "value": 4400349184.0 + "value": 209715200.0 }, { "name": "smsp__sass_thread_inst_executed_op_control_pred_on.sum", "type": "Double", "unit": "", - "value": 268435456.0 + "value": 67108864.0 }, { "name": "smsp__sass_thread_inst_executed_op_memory_pred_on.sum", "type": "Double", "unit": "", - "value": 167772160.0 + "value": 285212672.0 }, { "name": "smsp__sass_thread_inst_executed_op_misc_pred_on.sum", "type": "Double", "unit": "", - "value": 1626341376.0 + "value": 558891008.0 }, { "name": "smsp__sass_thread_inst_executed_op_conversion_pred_on.sum", "type": "Double", "unit": "", - "value": 1572864.0 + "value": 3145728.0 }, { "name": "smsp__inst_executed.sum", "type": "Double", "unit": "", - "value": 869564416.0 + "value": 194805760.0 }, { "name": "smsp__pipe_fma_cycles_active.avg.pct_of_peak_sustained_active", "type": "Double", "unit": "", - "value": 29.097585146907683 + "value": 38.44572064024574 }, { "name": "smsp__inst_executed_pipe_fp64.avg.pct_of_peak_sustained_active", @@ -44348,13 +47876,13 @@ "name": "smsp__inst_executed_pipe_xu.avg.pct_of_peak_sustained_active", "type": "Double", "unit": "", - "value": 26.40598431273635 + "value": 93.23481470253309 }, { "name": "smsp__inst_executed_pipe_lsu.avg.pct_of_peak_sustained_active", "type": "Double", "unit": "", - "value": 1.0411539224870412 + "value": 6.259661631639794 }, { "name": "smsp__inst_executed_pipe_tex.avg.pct_of_peak_sustained_active", @@ -44366,7 +47894,7 @@ "name": "smsp__issue_active.avg.pct_of_peak_sustained_active", "type": "Double", "unit": "", - "value": 85.53945900074706 + "value": 67.66308274317251 }, { "name": "smsp__thread_inst_executed_per_inst_executed.ratio", @@ -44385,30 +47913,47 @@ "time" ], "times": { - "compilation": 14267.295, - "data": 62580.347, - "framework": 3144320.111, - "kernel_overhead": 1493150.313, - "profiling_overhead": 52480.345, - "profiling_runs": 1536109.106, + "compilation_time": 15315.73, + "data": 57706.358, + "framework": 194294.569, + "kernel_overhead": 34080.039, + "profiling_overhead": 47442.109, + "profiling_runs": 55066.063, "runtimes": [ - 6372.064 + 1943.488 ], - "search_algorithm": 22.745, - "validation": 14.575 + "search_algorithm": 26.067, + "validation": 13.637 }, - "timestamp": "2026-01-27 09:27:11 UTC" + "timestamp": "2026-03-13 09:41:54 UTC" }, { + "compilation_data": { + "constant_memory_size": 0, + "global_size": { + "x": 8, + "y": 32, + "z": 8 + }, + "local_memory_size": 0, + "local_size": { + "x": 32, + "y": 8, + "z": 1 + }, + "max_work_group_size": 1024, + "private_memory_size": 0, + "registers": 48 + }, "configuration": { - "INNER_UNROLL_FACTOR": "4", - "USE_CONSTANT_MEMORY": "0", - "USE_SOA": "1", - "VECTOR_SIZE": "1", - "WORK_GROUP_SIZE_X": "32", - "WORK_GROUP_SIZE_Y": "4", - "WORK_GROUP_SIZE_Z": "1", - "Z_ITERATIONS": "32" + "INNER_UNROLL_FACTOR": 0, + "USE_CONSTANT_MEMORY": 0, + "USE_SOA": 1, + "VECTOR_SIZE": 1, + "WORK_GROUP_SIZE_X": 32, + "WORK_GROUP_SIZE_Y": 8, + "WORK_GROUP_SIZE_Z": 1, + "Z_ITERATIONS": 32 }, "correctness": 1, "invalidity": "correct", @@ -44416,61 +47961,61 @@ { "name": "time", "unit": "", - "value": 6562.272 + "value": 1858.496 }, { "name": "dram__throughput.avg.pct_of_peak_sustained_elapsed", "type": "Double", "unit": "", - "value": 16.12008079228564 + "value": 26.82262485059017 }, { "name": "dram__sectors_read.sum", "type": "Double", "unit": "", - "value": 2115160.0 + "value": 932.0 }, { "name": "dram__sectors_write.sum", "type": "Double", "unit": "", - "value": 1843416.0 + "value": 1837392.0 }, { "name": "lts__t_sectors.avg.pct_of_peak_sustained_elapsed", "type": "Double", "unit": "", - "value": 1.948877044205339 + "value": 3.506110936473611 }, { "name": "lts__t_sectors_op_read.sum", "type": "Double", "unit": "", - "value": 2221016.0 + "value": 29256.0 }, { "name": "lts__t_sectors_op_write.sum", "type": "Double", "unit": "", - "value": 2110732.0 + "value": 2099466.0 }, { "name": "l1tex__t_requests_pipe_lsu_mem_global_op_ld.avg.pct_of_peak_sustained_active", "type": "Double", "unit": "", - "value": 1.856599930220589 + "value": 5.961696163552753 }, { "name": "l1tex__t_requests_pipe_lsu_mem_global_op_ld.sum", "type": "Double", "unit": "", - "value": 4718592.0 + "value": 4194304.0 }, { "name": "l1tex__data_pipe_lsu_wavefronts_mem_shared.avg.pct_of_peak_sustained_elapsed", "type": "Double", "unit": "", - "value": 0.01933561760492877 + "value": 0.06975668495887376 }, { "name": "l1tex__data_pipe_lsu_wavefronts_mem_shared_op_ld.sum", @@ -44500,13 +48045,13 @@ "name": "sm__warps_active.avg.pct_of_peak_sustained_active", "type": "Double", "unit": "", - "value": 73.82010719685273 + "value": 81.30424595723531 }, { "name": "smsp__cycles_active.avg.pct_of_peak_sustained_elapsed", "type": "Double", "unit": "", - "value": 99.98068778875184 + "value": 99.82342555279108 }, { "name": "smsp__sass_thread_inst_executed_op_fp16_pred_on.sum", @@ -44518,7 +48063,7 @@ "name": "smsp__sass_thread_inst_executed_op_fp32_pred_on.sum", "type": "Double", "unit": "", - "value": 4479516672.0 + "value": 4429185024.0 }, { "name": "smsp__sass_thread_inst_executed_op_fp64_pred_on.sum", @@ -44530,25 +48075,25 @@ "name": "smsp__sass_thread_inst_executed_op_integer_pred_on.sum", "type": "Double", "unit": "", - "value": 4400349184.0 + "value": 138412032.0 }, { "name": "smsp__sass_thread_inst_executed_op_control_pred_on.sum", "type": "Double", "unit": "", - "value": 268435456.0 + "value": 33554432.0 }, { "name": "smsp__sass_thread_inst_executed_op_memory_pred_on.sum", "type": "Double", "unit": "", - "value": 167772160.0 + "value": 150994944.0 }, { "name": "smsp__sass_thread_inst_executed_op_misc_pred_on.sum", "type": "Double", "unit": "", - "value": 1626341376.0 + "value": 283639808.0 }, { "name": "smsp__sass_thread_inst_executed_op_conversion_pred_on.sum", @@ -44560,13 +48105,13 @@ "name": "smsp__inst_executed.sum", "type": "Double", "unit": "", - "value": 869564416.0 + "value": 165953536.0 }, { "name": "smsp__pipe_fma_cycles_active.avg.pct_of_peak_sustained_active", "type": "Double", "unit": "", - "value": 29.09599390676621 + "value": 37.842544517341395 }, { "name": "smsp__inst_executed_pipe_fp64.avg.pct_of_peak_sustained_active", @@ -44578,13 +48123,13 @@ "name": "smsp__inst_executed_pipe_xu.avg.pct_of_peak_sustained_active", "type": "Double", "unit": "", - "value": 26.40466256080919 + "value": 95.4095961641935 }, { "name": "smsp__inst_executed_pipe_lsu.avg.pct_of_peak_sustained_active", "type": "Double", "unit": "", - "value": 1.041101807512374 + "value": 3.389183652805213 }, { "name": "smsp__inst_executed_pipe_tex.avg.pct_of_peak_sustained_active", @@ -44596,7 +48141,7 @@ "name": "smsp__issue_active.avg.pct_of_peak_sustained_active", "type": "Double", "unit": "", - "value": 85.53517732188807 + "value": 58.98591772604958 }, { "name": "smsp__thread_inst_executed_per_inst_executed.ratio", @@ -44615,30 +48160,47 @@ "time" ], "times": { - "compilation": 14239.689, - "data": 63828.637, - "framework": 3151802.0990000004, - "kernel_overhead": 1495569.247, - "profiling_overhead": 53765.617, - "profiling_runs": 1538638.598, + "compilation_time": 14806.173, + "data": 57020.7, + "framework": 167731.686, + "kernel_overhead": 21357.409, + "profiling_overhead": 47219.056, + "profiling_runs": 42134.521, "runtimes": [ - 6562.272 + 1858.496 ], - "search_algorithm": 24.41, - "validation": 17.135 + "search_algorithm": 22.544, + "validation": 14.879 }, - "timestamp": "2026-01-27 09:27:12 UTC" + "timestamp": "2026-03-13 09:41:54 UTC" }, { + "compilation_data": { + "constant_memory_size": 0, + "global_size": { + "x": 8, + "y": 32, + "z": 128 + }, + "local_memory_size": 0, + "local_size": { + "x": 32, + "y": 8, + "z": 1 + }, + "max_work_group_size": 1024, + "private_memory_size": 0, + "registers": 22 + }, "configuration": { - "INNER_UNROLL_FACTOR": "4", - "USE_CONSTANT_MEMORY": "0", - "USE_SOA": "1", - "VECTOR_SIZE": "1", - "WORK_GROUP_SIZE_X": "32", - "WORK_GROUP_SIZE_Y": "8", - "WORK_GROUP_SIZE_Z": "1", - "Z_ITERATIONS": "32" + "INNER_UNROLL_FACTOR": 1, + "USE_CONSTANT_MEMORY": 0, + "USE_SOA": 1, + "VECTOR_SIZE": 1, + "WORK_GROUP_SIZE_X": 32, + "WORK_GROUP_SIZE_Y": 8, + "WORK_GROUP_SIZE_Z": 1, + "Z_ITERATIONS": 2 }, "correctness": 1, "invalidity": "correct", @@ -44646,61 +48208,61 @@ { "name": "time", "unit": "", - "value": 6332.384 + "value": 6408.896 }, { "name": "dram__throughput.avg.pct_of_peak_sustained_elapsed", "type": "Double", "unit": "", - "value": 16.298597930164252 + "value": 7.609012622964633 }, { "name": "dram__sectors_read.sum", "type": "Double", "unit": "", - "value": 2097596.0 + "value": 10012.0 }, { "name": "dram__sectors_write.sum", "type": "Double", "unit": "", - "value": 1837308.0 + "value": 1839448.0 }, { "name": "lts__t_sectors.avg.pct_of_peak_sustained_elapsed", "type": "Double", "unit": "", - "value": 1.9537876565451893 + "value": 1.0078509339919588 }, { "name": "lts__t_sectors_op_read.sum", "type": "Double", "unit": "", - "value": 2200258.0 + "value": 112793.0 }, { "name": "lts__t_sectors_op_write.sum", "type": "Double", "unit": "", - "value": 2099175.0 + "value": 2104546.0 }, { "name": "l1tex__t_requests_pipe_lsu_mem_global_op_ld.avg.pct_of_peak_sustained_active", "type": "Double", "unit": "", - "value": 1.8560269515111565 + "value": 26.221748235228105 }, { "name": "l1tex__t_requests_pipe_lsu_mem_global_op_ld.sum", "type": "Double", "unit": "", - "value": 4718592.0 + "value": 67108864.0 }, { "name": "l1tex__data_pipe_lsu_wavefronts_mem_shared.avg.pct_of_peak_sustained_elapsed", "type": "Double", "unit": "", - "value": 0.019330132870752515 + "value": 0.30720498248080963 }, { "name": "l1tex__data_pipe_lsu_wavefronts_mem_shared_op_ld.sum", @@ -44730,13 +48292,13 @@ "name": "sm__warps_active.avg.pct_of_peak_sustained_active", "type": "Double", "unit": "", - "value": 65.67779864241542 + "value": 97.28208326817835 }, { "name": "smsp__cycles_active.avg.pct_of_peak_sustained_elapsed", "type": "Double", "unit": "", - "value": 99.97334093801861 + "value": 99.96462344967408 }, { "name": "smsp__sass_thread_inst_executed_op_fp16_pred_on.sum", @@ -44748,7 +48310,7 @@ "name": "smsp__sass_thread_inst_executed_op_fp32_pred_on.sum", "type": "Double", "unit": "", - "value": 4479516672.0 + "value": 6979321856.0 }, { "name": "smsp__sass_thread_inst_executed_op_fp64_pred_on.sum", @@ -44760,43 +48322,43 @@ "name": "smsp__sass_thread_inst_executed_op_integer_pred_on.sum", "type": "Double", "unit": "", - "value": 4400349184.0 + "value": 10880024576.0 }, { "name": "smsp__sass_thread_inst_executed_op_control_pred_on.sum", "type": "Double", "unit": "", - "value": 268435456.0 + "value": 1073741824.0 }, { "name": "smsp__sass_thread_inst_executed_op_memory_pred_on.sum", "type": "Double", "unit": "", - "value": 167772160.0 + "value": 2164260864.0 }, { "name": "smsp__sass_thread_inst_executed_op_misc_pred_on.sum", "type": "Double", "unit": "", - "value": 1626341376.0 + "value": 2248146944.0 }, { "name": "smsp__sass_thread_inst_executed_op_conversion_pred_on.sum", "type": "Double", "unit": "", - "value": 1572864.0 + "value": 25165824.0 }, { "name": "smsp__inst_executed.sum", "type": "Double", "unit": "", - "value": 869564416.0 + "value": 816054272.0 }, { "name": "smsp__pipe_fma_cycles_active.avg.pct_of_peak_sustained_active", "type": "Double", "unit": "", - "value": 29.089959678056093 + "value": 42.92140548617156 }, { "name": "smsp__inst_executed_pipe_fp64.avg.pct_of_peak_sustained_active", @@ -44808,13 +48370,13 @@ "name": "smsp__inst_executed_pipe_xu.avg.pct_of_peak_sustained_active", "type": "Double", "unit": "", - "value": 26.399112501965206 + "value": 26.2241023544627 }, { "name": "smsp__inst_executed_pipe_lsu.avg.pct_of_peak_sustained_active", "type": "Double", "unit": "", - "value": 1.0408829758465283 + "value": 13.368145926786651 }, { "name": "smsp__inst_executed_pipe_tex.avg.pct_of_peak_sustained_active", @@ -44826,7 +48388,7 @@ "name": "smsp__issue_active.avg.pct_of_peak_sustained_active", "type": "Double", "unit": "", - "value": 85.51716307549803 + "value": 79.72266715843195 }, { "name": "smsp__thread_inst_executed_per_inst_executed.ratio", @@ -44845,30 +48407,47 @@ "time" ], "times": { - "compilation": 14260.869, - "data": 60529.955, - "framework": 3138992.471, - "kernel_overhead": 1492675.191, - "profiling_overhead": 50471.71, - "profiling_runs": 1535315.615, + "compilation_time": 14990.5, + "data": 58655.271, + "framework": 2150695.997, + "kernel_overhead": 998982.264, + "profiling_overhead": 48983.972, + "profiling_runs": 1044074.49, "runtimes": [ - 6332.384 + 6408.896 ], - "search_algorithm": 24.349, - "validation": 14.094 + "search_algorithm": 27.353, + "validation": 19.129 }, - "timestamp": "2026-01-27 09:27:14 UTC" + "timestamp": "2026-03-13 09:41:55 UTC" }, { + "compilation_data": { + "constant_memory_size": 0, + "global_size": { + "x": 8, + "y": 32, + "z": 64 + }, + "local_memory_size": 0, + "local_size": { + "x": 32, + "y": 8, + "z": 1 + }, + "max_work_group_size": 1024, + "private_memory_size": 0, + "registers": 25 + }, "configuration": { - "INNER_UNROLL_FACTOR": "8", - "USE_CONSTANT_MEMORY": "0", - "USE_SOA": "1", - "VECTOR_SIZE": "1", - "WORK_GROUP_SIZE_X": "16", - "WORK_GROUP_SIZE_Y": "4", - "WORK_GROUP_SIZE_Z": "1", - "Z_ITERATIONS": "16" + "INNER_UNROLL_FACTOR": 1, + "USE_CONSTANT_MEMORY": 0, + "USE_SOA": 1, + "VECTOR_SIZE": 1, + "WORK_GROUP_SIZE_X": 32, + "WORK_GROUP_SIZE_Y": 8, + "WORK_GROUP_SIZE_Z": 1, + "Z_ITERATIONS": 4 }, "correctness": 1, "invalidity": "correct", @@ -44876,61 +48455,61 @@ { "name": "time", "unit": "", - "value": 3533.888 + "value": 6281.056 }, { "name": "dram__throughput.avg.pct_of_peak_sustained_elapsed", "type": "Double", "unit": "", - "value": 29.862638618817506 + "value": 7.773107973778948 }, { "name": "dram__sectors_read.sum", "type": "Double", "unit": "", - "value": 2101876.0 + "value": 12280.0 }, { "name": "dram__sectors_write.sum", "type": "Double", "unit": "", - "value": 1868928.0 + "value": 1839436.0 }, { "name": "lts__t_sectors.avg.pct_of_peak_sustained_elapsed", "type": "Double", "unit": "", - "value": 3.539076620707305 + "value": 1.026547350431687 }, { "name": "lts__t_sectors_op_read.sum", "type": "Double", "unit": "", - "value": 2146857.0 + "value": 112261.0 }, { "name": "lts__t_sectors_op_write.sum", "type": "Double", "unit": "", - "value": 2101046.0 + "value": 2101595.0 }, { "name": "l1tex__t_requests_pipe_lsu_mem_global_op_ld.avg.pct_of_peak_sustained_active", "type": "Double", "unit": "", - "value": 6.404053576823852 + "value": 13.377647005925072 }, { "name": "l1tex__t_requests_pipe_lsu_mem_global_op_ld.sum", "type": "Double", "unit": "", - "value": 8912896.0 + "value": 33554432.0 }, { "name": "l1tex__data_pipe_lsu_wavefronts_mem_shared.avg.pct_of_peak_sustained_elapsed", "type": "Double", "unit": "", - "value": 0.07060906756497091 + "value": 0.15672075092809026 }, { "name": "l1tex__data_pipe_lsu_wavefronts_mem_shared_op_ld.sum", @@ -44960,13 +48539,13 @@ "name": "sm__warps_active.avg.pct_of_peak_sustained_active", "type": "Double", "unit": "", - "value": 98.64574348147805 + "value": 98.14531652982855 }, { "name": "smsp__cycles_active.avg.pct_of_peak_sustained_elapsed", "type": "Double", "unit": "", - "value": 99.9598789098737 + "value": 99.96198889272925 }, { "name": "smsp__sass_thread_inst_executed_op_fp16_pred_on.sum", @@ -44978,7 +48557,7 @@ "name": "smsp__sass_thread_inst_executed_op_fp32_pred_on.sum", "type": "Double", "unit": "", - "value": 4647288832.0 + "value": 5637144576.0 }, { "name": "smsp__sass_thread_inst_executed_op_fp64_pred_on.sum", @@ -44990,43 +48569,43 @@ "name": "smsp__sass_thread_inst_executed_op_integer_pred_on.sum", "type": "Double", "unit": "", - "value": 2091909120.0 + "value": 10854858752.0 }, { "name": "smsp__sass_thread_inst_executed_op_control_pred_on.sum", "type": "Double", "unit": "", - "value": 134217728.0 + "value": 1073741824.0 }, { "name": "smsp__sass_thread_inst_executed_op_memory_pred_on.sum", "type": "Double", "unit": "", - "value": 301989888.0 + "value": 1090519040.0 }, { "name": "smsp__sass_thread_inst_executed_op_misc_pred_on.sum", "type": "Double", "unit": "", - "value": 1830813696.0 + "value": 1392508928.0 }, { "name": "smsp__sass_thread_inst_executed_op_conversion_pred_on.sum", "type": "Double", "unit": "", - "value": 3145728.0 + "value": 12582912.0 }, { "name": "smsp__inst_executed.sum", "type": "Double", "unit": "", - "value": 485130240.0 + "value": 837287936.0 }, { "name": "smsp__pipe_fma_cycles_active.avg.pct_of_peak_sustained_active", "type": "Double", "unit": "", - "value": 36.572416707598734 + "value": 36.21613337334592 }, { "name": "smsp__inst_executed_pipe_fp64.avg.pct_of_peak_sustained_active", @@ -45038,13 +48617,13 @@ "name": "smsp__inst_executed_pipe_xu.avg.pct_of_peak_sustained_active", "type": "Double", "unit": "", - "value": 48.22180390442517 + "value": 26.75717885835255 }, { "name": "smsp__inst_executed_pipe_lsu.avg.pct_of_peak_sustained_active", "type": "Double", "unit": "", - "value": 3.4259142910614564 + "value": 6.8722051169401555 }, { "name": "smsp__inst_executed_pipe_tex.avg.pct_of_peak_sustained_active", @@ -45056,7 +48635,7 @@ "name": "smsp__issue_active.avg.pct_of_peak_sustained_active", "type": "Double", "unit": "", - "value": 87.14956039963747 + "value": 83.45982163809602 }, { "name": "smsp__thread_inst_executed_per_inst_executed.ratio", @@ -45075,30 +48654,47 @@ "time" ], "times": { - "compilation": 14132.846, - "data": 59400.474, - "framework": 1729719.79, - "kernel_overhead": 795972.029, - "profiling_overhead": 49342.741, - "profiling_runs": 825004.546, + "compilation_time": 14650.555, + "data": 58076.484, + "framework": 2724508.1679999996, + "kernel_overhead": 1287551.892, + "profiling_overhead": 47993.258, + "profiling_runs": 1330886.534, "runtimes": [ - 3533.888 + 6281.056 ], - "search_algorithm": 25.714, - "validation": 16.771 + "search_algorithm": 26.515, + "validation": 15.489 }, - "timestamp": "2026-01-27 09:27:15 UTC" + "timestamp": "2026-03-13 09:41:56 UTC" }, { + "compilation_data": { + "constant_memory_size": 0, + "global_size": { + "x": 8, + "y": 32, + "z": 32 + }, + "local_memory_size": 0, + "local_size": { + "x": 32, + "y": 8, + "z": 1 + }, + "max_work_group_size": 1024, + "private_memory_size": 32, + "registers": 27 + }, "configuration": { - "INNER_UNROLL_FACTOR": "8", - "USE_CONSTANT_MEMORY": "0", - "USE_SOA": "1", - "VECTOR_SIZE": "1", - "WORK_GROUP_SIZE_X": "16", - "WORK_GROUP_SIZE_Y": "8", - "WORK_GROUP_SIZE_Z": "1", - "Z_ITERATIONS": "16" + "INNER_UNROLL_FACTOR": 1, + "USE_CONSTANT_MEMORY": 0, + "USE_SOA": 1, + "VECTOR_SIZE": 1, + "WORK_GROUP_SIZE_X": 32, + "WORK_GROUP_SIZE_Y": 8, + "WORK_GROUP_SIZE_Z": 1, + "Z_ITERATIONS": 8 }, "correctness": 1, "invalidity": "correct", @@ -45106,61 +48702,61 @@ { "name": "time", "unit": "", - "value": 3620.704 + "value": 8149.536 }, { "name": "dram__throughput.avg.pct_of_peak_sustained_elapsed", "type": "Double", "unit": "", - "value": 29.173632111797744 + "value": 5.761415962054825 }, { "name": "dram__sectors_read.sum", "type": "Double", "unit": "", - "value": 2105684.0 + "value": 5744.0 }, { "name": "dram__sectors_write.sum", "type": "Double", "unit": "", - "value": 1870444.0 + "value": 1911316.0 }, { "name": "lts__t_sectors.avg.pct_of_peak_sustained_elapsed", "type": "Double", "unit": "", - "value": 3.50687303374669 + "value": 48.82877785060414 }, { "name": "lts__t_sectors_op_read.sum", "type": "Double", "unit": "", - "value": 2153353.0 + "value": 610370.0 }, { "name": "lts__t_sectors_op_write.sum", "type": "Double", "unit": "", - "value": 2100801.0 + "value": 138420321.0 }, { "name": "l1tex__t_requests_pipe_lsu_mem_global_op_ld.avg.pct_of_peak_sustained_active", "type": "Double", "unit": "", - "value": 6.403246636950552 + "value": 5.187229761894996 }, { "name": "l1tex__t_requests_pipe_lsu_mem_global_op_ld.sum", "type": "Double", "unit": "", - "value": 8912896.0 + "value": 16777216.0 }, { "name": "l1tex__data_pipe_lsu_wavefronts_mem_shared.avg.pct_of_peak_sustained_elapsed", "type": "Double", "unit": "", - "value": 0.0706025358766947 + "value": 0.05995827951844348 }, { "name": "l1tex__data_pipe_lsu_wavefronts_mem_shared_op_ld.sum", @@ -45178,25 +48774,25 @@ "name": "l1tex__t_sectors_pipe_lsu_mem_local_op_ld.sum", "type": "Double", "unit": "", - "value": 0.0 + "value": 136314880.0 }, { "name": "l1tex__t_sectors_pipe_lsu_mem_local_op_st.sum", "type": "Double", "unit": "", - "value": 0.0 + "value": 136314880.0 }, { "name": "sm__warps_active.avg.pct_of_peak_sustained_active", "type": "Double", "unit": "", - "value": 98.5720815345859 + "value": 96.42944312491501 }, { "name": "smsp__cycles_active.avg.pct_of_peak_sustained_elapsed", "type": "Double", "unit": "", - "value": 99.953246576139 + "value": 98.73024022029983 }, { "name": "smsp__sass_thread_inst_executed_op_fp16_pred_on.sum", @@ -45208,7 +48804,7 @@ "name": "smsp__sass_thread_inst_executed_op_fp32_pred_on.sum", "type": "Double", "unit": "", - "value": 4647288832.0 + "value": 4966055936.0 }, { "name": "smsp__sass_thread_inst_executed_op_fp64_pred_on.sum", @@ -45220,43 +48816,43 @@ "name": "smsp__sass_thread_inst_executed_op_integer_pred_on.sum", "type": "Double", "unit": "", - "value": 2091909120.0 + "value": 4250927104.0 }, { "name": "smsp__sass_thread_inst_executed_op_control_pred_on.sum", "type": "Double", "unit": "", - "value": 134217728.0 + "value": 1073741824.0 }, { "name": "smsp__sass_thread_inst_executed_op_memory_pred_on.sum", "type": "Double", "unit": "", - "value": 301989888.0 + "value": 2722103296.0 }, { "name": "smsp__sass_thread_inst_executed_op_misc_pred_on.sum", "type": "Double", "unit": "", - "value": 1830813696.0 + "value": 289406976.0 }, { "name": "smsp__sass_thread_inst_executed_op_conversion_pred_on.sum", "type": "Double", "unit": "", - "value": 3145728.0 + "value": 6291456.0 }, { "name": "smsp__inst_executed.sum", "type": "Double", "unit": "", - "value": 485130240.0 + "value": 420413440.0 }, { "name": "smsp__pipe_fma_cycles_active.avg.pct_of_peak_sustained_active", "type": "Double", "unit": "", - "value": 36.57138572373798 + "value": 12.45976285165539 }, { "name": "smsp__inst_executed_pipe_fp64.avg.pct_of_peak_sustained_active", @@ -45268,13 +48864,13 @@ "name": "smsp__inst_executed_pipe_xu.avg.pct_of_peak_sustained_active", "type": "Double", "unit": "", - "value": 48.220542579817334 + "value": 20.72896750103733 }, { "name": "smsp__inst_executed_pipe_lsu.avg.pct_of_peak_sustained_active", "type": "Double", "unit": "", - "value": 3.4258246803532337 + "value": 13.168157577563267 }, { "name": "smsp__inst_executed_pipe_tex.avg.pct_of_peak_sustained_active", @@ -45286,7 +48882,7 @@ "name": "smsp__issue_active.avg.pct_of_peak_sustained_active", "type": "Double", "unit": "", - "value": 87.14730707907763 + "value": 32.465281928429185 }, { "name": "smsp__thread_inst_executed_per_inst_executed.ratio", @@ -45305,260 +48901,47 @@ "time" ], "times": { - "compilation": 14166.068, - "data": 65583.281, - "framework": 1738984.213, - "kernel_overhead": 794347.611, - "profiling_overhead": 55366.733, - "profiling_runs": 823686.588, + "compilation_time": 14489.998, + "data": 58602.065, + "framework": 911871.318, + "kernel_overhead": 374904.639, + "profiling_overhead": 48598.67, + "profiling_runs": 429765.944, "runtimes": [ - 3620.704 + 8149.536 ], - "search_algorithm": 24.735, - "validation": 16.222 + "search_algorithm": 26.47, + "validation": 15.813 }, - "timestamp": "2026-01-27 09:27:16 UTC" + "timestamp": "2026-03-13 09:41:57 UTC" }, { - "configuration": { - "INNER_UNROLL_FACTOR": "8", - "USE_CONSTANT_MEMORY": "0", - "USE_SOA": "1", - "VECTOR_SIZE": "1", - "WORK_GROUP_SIZE_X": "32", - "WORK_GROUP_SIZE_Y": "2", - "WORK_GROUP_SIZE_Z": "1", - "Z_ITERATIONS": "16" + "compilation_data": { + "constant_memory_size": 0, + "global_size": { + "x": 8, + "y": 32, + "z": 16 + }, + "local_memory_size": 0, + "local_size": { + "x": 32, + "y": 8, + "z": 1 + }, + "max_work_group_size": 1024, + "private_memory_size": 64, + "registers": 34 }, - "correctness": 1, - "invalidity": "correct", - "measurements": [ - { - "name": "time", - "unit": "", - "value": 3491.936 - }, - { - "name": "dram__throughput.avg.pct_of_peak_sustained_elapsed", - "type": "Double", - "unit": "", - "value": 29.6207020643571 - }, - { - "name": "dram__sectors_read.sum", - "type": "Double", - "unit": "", - "value": 2099228.0 - }, - { - "name": "dram__sectors_write.sum", - "type": "Double", - "unit": "", - "value": 1838496.0 - }, - { - "name": "lts__t_sectors.avg.pct_of_peak_sustained_elapsed", - "type": "Double", - "unit": "", - "value": 3.552254020274117 - }, - { - "name": "lts__t_sectors_op_read.sum", - "type": "Double", - "unit": "", - "value": 2157060.0 - }, - { - "name": "lts__t_sectors_op_write.sum", - "type": "Double", - "unit": "", - "value": 2100795.0 - }, - { - "name": "l1tex__t_requests_pipe_lsu_mem_global_op_ld.avg.pct_of_peak_sustained_active", - "type": "Double", - "unit": "", - "value": 6.406146015934075 - }, - { - "name": "l1tex__t_requests_pipe_lsu_mem_global_op_ld.sum", - "type": "Double", - "unit": "", - "value": 8912896.0 - }, - { - "name": "l1tex__data_pipe_lsu_wavefronts_mem_shared.avg.pct_of_peak_sustained_elapsed", - "type": "Double", - "unit": "", - "value": 0.07062818273708808 - }, - { - "name": "l1tex__data_pipe_lsu_wavefronts_mem_shared_op_ld.sum", - "type": "Double", - "unit": "", - "value": 0.0 - }, - { - "name": "l1tex__data_pipe_lsu_wavefronts_mem_shared_op_st.sum", - "type": "Double", - "unit": "", - "value": 0.0 - }, - { - "name": "l1tex__t_sectors_pipe_lsu_mem_local_op_ld.sum", - "type": "Double", - "unit": "", - "value": 0.0 - }, - { - "name": "l1tex__t_sectors_pipe_lsu_mem_local_op_st.sum", - "type": "Double", - "unit": "", - "value": 0.0 - }, - { - "name": "sm__warps_active.avg.pct_of_peak_sustained_active", - "type": "Double", - "unit": "", - "value": 98.7006498248347 - }, - { - "name": "smsp__cycles_active.avg.pct_of_peak_sustained_elapsed", - "type": "Double", - "unit": "", - "value": 99.95836429686736 - }, - { - "name": "smsp__sass_thread_inst_executed_op_fp16_pred_on.sum", - "type": "Double", - "unit": "", - "value": 0.0 - }, - { - "name": "smsp__sass_thread_inst_executed_op_fp32_pred_on.sum", - "type": "Double", - "unit": "", - "value": 4647288832.0 - }, - { - "name": "smsp__sass_thread_inst_executed_op_fp64_pred_on.sum", - "type": "Double", - "unit": "", - "value": 0.0 - }, - { - "name": "smsp__sass_thread_inst_executed_op_integer_pred_on.sum", - "type": "Double", - "unit": "", - "value": 2091909120.0 - }, - { - "name": "smsp__sass_thread_inst_executed_op_control_pred_on.sum", - "type": "Double", - "unit": "", - "value": 134217728.0 - }, - { - "name": "smsp__sass_thread_inst_executed_op_memory_pred_on.sum", - "type": "Double", - "unit": "", - "value": 301989888.0 - }, - { - "name": "smsp__sass_thread_inst_executed_op_misc_pred_on.sum", - "type": "Double", - "unit": "", - "value": 1830813696.0 - }, - { - "name": "smsp__sass_thread_inst_executed_op_conversion_pred_on.sum", - "type": "Double", - "unit": "", - "value": 3145728.0 - }, - { - "name": "smsp__inst_executed.sum", - "type": "Double", - "unit": "", - "value": 485130240.0 - }, - { - "name": "smsp__pipe_fma_cycles_active.avg.pct_of_peak_sustained_active", - "type": "Double", - "unit": "", - "value": 36.583059621888495 - }, - { - "name": "smsp__inst_executed_pipe_fp64.avg.pct_of_peak_sustained_active", - "type": "Double", - "unit": "", - "value": 0.0 - }, - { - "name": "smsp__inst_executed_pipe_xu.avg.pct_of_peak_sustained_active", - "type": "Double", - "unit": "", - "value": 48.23558930862095 - }, - { - "name": "smsp__inst_executed_pipe_lsu.avg.pct_of_peak_sustained_active", - "type": "Double", - "unit": "", - "value": 3.4268936740255604 - }, - { - "name": "smsp__inst_executed_pipe_tex.avg.pct_of_peak_sustained_active", - "type": "Double", - "unit": "", - "value": 0.0 - }, - { - "name": "smsp__issue_active.avg.pct_of_peak_sustained_active", - "type": "Double", - "unit": "", - "value": 87.17447373392126 - }, - { - "name": "smsp__thread_inst_executed_per_inst_executed.ratio", - "type": "Double", - "unit": "", - "value": 32.0 - }, - { - "name": "smsp__thread_inst_executed_per_inst_executed.pct", - "type": "Double", - "unit": "", - "value": 100.0 - } - ], - "objectives": [ - "time" - ], - "times": { - "compilation": 14245.257, - "data": 62099.242, - "framework": 1736895.947, - "kernel_overhead": 796987.07, - "profiling_overhead": 51776.163, - "profiling_runs": 826033.472, - "runtimes": [ - 3491.936 - ], - "search_algorithm": 35.401, - "validation": 16.643 - }, - "timestamp": "2026-01-27 09:27:17 UTC" - }, - { "configuration": { - "INNER_UNROLL_FACTOR": "8", - "USE_CONSTANT_MEMORY": "0", - "USE_SOA": "1", - "VECTOR_SIZE": "1", - "WORK_GROUP_SIZE_X": "32", - "WORK_GROUP_SIZE_Y": "4", - "WORK_GROUP_SIZE_Z": "1", - "Z_ITERATIONS": "16" + "INNER_UNROLL_FACTOR": 1, + "USE_CONSTANT_MEMORY": 0, + "USE_SOA": 1, + "VECTOR_SIZE": 1, + "WORK_GROUP_SIZE_X": 32, + "WORK_GROUP_SIZE_Y": 8, + "WORK_GROUP_SIZE_Z": 1, + "Z_ITERATIONS": 16 }, "correctness": 1, "invalidity": "correct", @@ -45566,61 +48949,61 @@ { "name": "time", "unit": "", - "value": 3511.808 + "value": 8238.911 }, { "name": "dram__throughput.avg.pct_of_peak_sustained_elapsed", "type": "Double", "unit": "", - "value": 29.588595410442014 + "value": 6.361084892454626 }, { "name": "dram__sectors_read.sum", "type": "Double", "unit": "", - "value": 2097336.0 + "value": 30340.0 }, { "name": "dram__sectors_write.sum", "type": "Double", "unit": "", - "value": 1837256.0 + "value": 2005484.0 }, { "name": "lts__t_sectors.avg.pct_of_peak_sustained_elapsed", "type": "Double", "unit": "", - "value": 3.5659581226574644 + "value": 51.713917583884076 }, { "name": "lts__t_sectors_op_read.sum", "type": "Double", "unit": "", - "value": 2153183.0 + "value": 8368787.0 }, { "name": "lts__t_sectors_op_write.sum", "type": "Double", "unit": "", - "value": 2098982.0 + "value": 138421550.0 }, { "name": "l1tex__t_requests_pipe_lsu_mem_global_op_ld.avg.pct_of_peak_sustained_active", "type": "Double", "unit": "", - "value": 6.4058082535730945 + "value": 2.562987756360134 }, { "name": "l1tex__t_requests_pipe_lsu_mem_global_op_ld.sum", "type": "Double", "unit": "", - "value": 8912896.0 + "value": 8388608.0 }, { "name": "l1tex__data_pipe_lsu_wavefronts_mem_shared.avg.pct_of_peak_sustained_elapsed", "type": "Double", "unit": "", - "value": 0.07062106711260009 + "value": 0.029862256695696095 }, { "name": "l1tex__data_pipe_lsu_wavefronts_mem_shared_op_ld.sum", @@ -45638,25 +49021,25 @@ "name": "l1tex__t_sectors_pipe_lsu_mem_local_op_ld.sum", "type": "Double", "unit": "", - "value": 0.0 + "value": 136314880.0 }, { "name": "l1tex__t_sectors_pipe_lsu_mem_local_op_st.sum", "type": "Double", "unit": "", - "value": 0.0 + "value": 136314880.0 }, { "name": "sm__warps_active.avg.pct_of_peak_sustained_active", "type": "Double", "unit": "", - "value": 98.633744210871 + "value": 85.37498272636932 }, { "name": "smsp__cycles_active.avg.pct_of_peak_sustained_elapsed", "type": "Double", "unit": "", - "value": 99.94970249697312 + "value": 98.10492132019559 }, { "name": "smsp__sass_thread_inst_executed_op_fp16_pred_on.sum", @@ -45668,7 +49051,7 @@ "name": "smsp__sass_thread_inst_executed_op_fp32_pred_on.sum", "type": "Double", "unit": "", - "value": 4647288832.0 + "value": 4630511616.0 }, { "name": "smsp__sass_thread_inst_executed_op_fp64_pred_on.sum", @@ -45680,25 +49063,25 @@ "name": "smsp__sass_thread_inst_executed_op_integer_pred_on.sum", "type": "Double", "unit": "", - "value": 2091909120.0 + "value": 3769630720.0 }, { "name": "smsp__sass_thread_inst_executed_op_control_pred_on.sum", "type": "Double", "unit": "", - "value": 134217728.0 + "value": 1073741824.0 }, { "name": "smsp__sass_thread_inst_executed_op_memory_pred_on.sum", "type": "Double", "unit": "", - "value": 301989888.0 + "value": 2453667840.0 }, { "name": "smsp__sass_thread_inst_executed_op_misc_pred_on.sum", "type": "Double", "unit": "", - "value": 1830813696.0 + "value": 144703488.0 }, { "name": "smsp__sass_thread_inst_executed_op_conversion_pred_on.sum", @@ -45710,13 +49093,13 @@ "name": "smsp__inst_executed.sum", "type": "Double", "unit": "", - "value": 485130240.0 + "value": 379617280.0 }, { "name": "smsp__pipe_fma_cycles_active.avg.pct_of_peak_sustained_active", "type": "Double", "unit": "", - "value": 36.58218856575422 + "value": 10.20188952293199 }, { "name": "smsp__inst_executed_pipe_fp64.avg.pct_of_peak_sustained_active", @@ -45728,13 +49111,13 @@ "name": "smsp__inst_executed_pipe_xu.avg.pct_of_peak_sustained_active", "type": "Double", "unit": "", - "value": 48.23490943723587 + "value": 20.77976003982443 }, { "name": "smsp__inst_executed_pipe_lsu.avg.pct_of_peak_sustained_active", "type": "Double", "unit": "", - "value": 3.4268453726161225 + "value": 11.886469182936679 }, { "name": "smsp__inst_executed_pipe_tex.avg.pct_of_peak_sustained_active", @@ -45746,7 +49129,7 @@ "name": "smsp__issue_active.avg.pct_of_peak_sustained_active", "type": "Double", "unit": "", - "value": 87.17327180009133 + "value": 29.386780007938107 }, { "name": "smsp__thread_inst_executed_per_inst_executed.ratio", @@ -45765,30 +49148,47 @@ "time" ], "times": { - "compilation": 14155.711, - "data": 61969.538, - "framework": 1734272.6430000002, - "kernel_overhead": 795925.438, - "profiling_overhead": 51523.251, - "profiling_runs": 824854.416, + "compilation_time": 14410.33, + "data": 58612.282, + "framework": 888043.285, + "kernel_overhead": 363576.998, + "profiling_overhead": 48412.581, + "profiling_runs": 417441.424, "runtimes": [ - 3511.808 + 8238.911 ], - "search_algorithm": 25.246, - "validation": 14.978 + "search_algorithm": 25.961, + "validation": 16.454 }, - "timestamp": "2026-01-27 09:27:18 UTC" + "timestamp": "2026-03-13 09:41:57 UTC" }, { + "compilation_data": { + "constant_memory_size": 0, + "global_size": { + "x": 8, + "y": 32, + "z": 8 + }, + "local_memory_size": 0, + "local_size": { + "x": 32, + "y": 8, + "z": 1 + }, + "max_work_group_size": 1024, + "private_memory_size": 128, + "registers": 32 + }, "configuration": { - "INNER_UNROLL_FACTOR": "8", - "USE_CONSTANT_MEMORY": "0", - "USE_SOA": "1", - "VECTOR_SIZE": "1", - "WORK_GROUP_SIZE_X": "32", - "WORK_GROUP_SIZE_Y": "8", - "WORK_GROUP_SIZE_Z": "1", - "Z_ITERATIONS": "16" + "INNER_UNROLL_FACTOR": 1, + "USE_CONSTANT_MEMORY": 0, + "USE_SOA": 1, + "VECTOR_SIZE": 1, + "WORK_GROUP_SIZE_X": 32, + "WORK_GROUP_SIZE_Y": 8, + "WORK_GROUP_SIZE_Z": 1, + "Z_ITERATIONS": 32 }, "correctness": 1, "invalidity": "correct", @@ -45796,61 +49196,61 @@ { "name": "time", "unit": "", - "value": 3504.032 + "value": 8762.336 }, { "name": "dram__throughput.avg.pct_of_peak_sustained_elapsed", "type": "Double", "unit": "", - "value": 29.687016973549476 + "value": 6.159756287228185 }, { "name": "dram__sectors_read.sum", "type": "Double", "unit": "", - "value": 2097400.0 + "value": 6560.0 }, { "name": "dram__sectors_write.sum", "type": "Double", "unit": "", - "value": 1836068.0 + "value": 2198684.0 }, { "name": "lts__t_sectors.avg.pct_of_peak_sustained_elapsed", "type": "Double", "unit": "", - "value": 3.5457878391203854 + "value": 61.65353095553267 }, { "name": "lts__t_sectors_op_read.sum", "type": "Double", "unit": "", - "value": 2153032.0 + "value": 47820018.0 }, { "name": "lts__t_sectors_op_write.sum", "type": "Double", "unit": "", - "value": 2099161.0 + "value": 138416010.0 }, { "name": "l1tex__t_requests_pipe_lsu_mem_global_op_ld.avg.pct_of_peak_sustained_active", "type": "Double", "unit": "", - "value": 6.426831968088507 + "value": 1.2157275934303793 }, { "name": "l1tex__t_requests_pipe_lsu_mem_global_op_ld.sum", "type": "Double", "unit": "", - "value": 8912896.0 + "value": 4194304.0 }, { "name": "l1tex__data_pipe_lsu_wavefronts_mem_shared.avg.pct_of_peak_sustained_elapsed", "type": "Double", "unit": "", - "value": 0.070857541167205 + "value": 0.014197771490611412 }, { "name": "l1tex__data_pipe_lsu_wavefronts_mem_shared_op_ld.sum", @@ -45868,25 +49268,25 @@ "name": "l1tex__t_sectors_pipe_lsu_mem_local_op_ld.sum", "type": "Double", "unit": "", - "value": 0.0 + "value": 136314880.0 }, { "name": "l1tex__t_sectors_pipe_lsu_mem_local_op_st.sum", "type": "Double", "unit": "", - "value": 0.0 + "value": 136314880.0 }, { "name": "sm__warps_active.avg.pct_of_peak_sustained_active", "type": "Double", "unit": "", - "value": 98.45264236598801 + "value": 75.6739226901639 }, { "name": "smsp__cycles_active.avg.pct_of_peak_sustained_elapsed", "type": "Double", "unit": "", - "value": 99.95114272201135 + "value": 99.67702164053263 }, { "name": "smsp__sass_thread_inst_executed_op_fp16_pred_on.sum", @@ -45898,7 +49298,7 @@ "name": "smsp__sass_thread_inst_executed_op_fp32_pred_on.sum", "type": "Double", "unit": "", - "value": 4647288832.0 + "value": 4462739456.0 }, { "name": "smsp__sass_thread_inst_executed_op_fp64_pred_on.sum", @@ -45910,43 +49310,43 @@ "name": "smsp__sass_thread_inst_executed_op_integer_pred_on.sum", "type": "Double", "unit": "", - "value": 2091909120.0 + "value": 3528982528.0 }, { "name": "smsp__sass_thread_inst_executed_op_control_pred_on.sum", "type": "Double", "unit": "", - "value": 134217728.0 + "value": 1073741824.0 }, { "name": "smsp__sass_thread_inst_executed_op_memory_pred_on.sum", "type": "Double", "unit": "", - "value": 301989888.0 + "value": 2319450112.0 }, { "name": "smsp__sass_thread_inst_executed_op_misc_pred_on.sum", "type": "Double", "unit": "", - "value": 1830813696.0 + "value": 72351744.0 }, { "name": "smsp__sass_thread_inst_executed_op_conversion_pred_on.sum", "type": "Double", "unit": "", - "value": 3145728.0 + "value": 1572864.0 }, { "name": "smsp__inst_executed.sum", "type": "Double", "unit": "", - "value": 485130240.0 + "value": 359219200.0 }, { "name": "smsp__pipe_fma_cycles_active.avg.pct_of_peak_sustained_active", "type": "Double", "unit": "", - "value": 36.703666069691806 + "value": 8.476945889420886 }, { "name": "smsp__inst_executed_pipe_fp64.avg.pct_of_peak_sustained_active", @@ -45958,13 +49358,13 @@ "name": "smsp__inst_executed_pipe_xu.avg.pct_of_peak_sustained_active", "type": "Double", "unit": "", - "value": 48.39572627133095 + "value": 19.447501897768245 }, { "name": "smsp__inst_executed_pipe_lsu.avg.pct_of_peak_sustained_active", "type": "Double", "unit": "", - "value": 3.4382705920305927 + "value": 10.50953258073975 }, { "name": "smsp__inst_executed_pipe_tex.avg.pct_of_peak_sustained_active", @@ -45976,7 +49376,7 @@ "name": "smsp__issue_active.avg.pct_of_peak_sustained_active", "type": "Double", "unit": "", - "value": 87.4639028909865 + "value": 26.024906023847738 }, { "name": "smsp__thread_inst_executed_per_inst_executed.ratio", @@ -45995,30 +49395,47 @@ "time" ], "times": { - "compilation": 14244.202, - "data": 61499.368, - "framework": 1733942.133, - "kernel_overhead": 796128.699, - "profiling_overhead": 51319.635, - "profiling_runs": 824994.431, + "compilation_time": 14417.675, + "data": 59053.122, + "framework": 865367.642, + "kernel_overhead": 350157.489, + "profiling_overhead": 48769.374, + "profiling_runs": 407387.657, "runtimes": [ - 3504.032 + 8762.336 ], - "search_algorithm": 24.241, - "validation": 15.581 + "search_algorithm": 24.558, + "validation": 14.715 }, - "timestamp": "2026-01-27 09:27:19 UTC" + "timestamp": "2026-03-13 09:41:58 UTC" }, { + "compilation_data": { + "constant_memory_size": 0, + "global_size": { + "x": 8, + "y": 32, + "z": 64 + }, + "local_memory_size": 0, + "local_size": { + "x": 32, + "y": 8, + "z": 1 + }, + "max_work_group_size": 1024, + "private_memory_size": 0, + "registers": 26 + }, "configuration": { - "INNER_UNROLL_FACTOR": "8", - "USE_CONSTANT_MEMORY": "0", - "USE_SOA": "1", - "VECTOR_SIZE": "1", - "WORK_GROUP_SIZE_X": "16", - "WORK_GROUP_SIZE_Y": "4", - "WORK_GROUP_SIZE_Z": "1", - "Z_ITERATIONS": "32" + "INNER_UNROLL_FACTOR": 2, + "USE_CONSTANT_MEMORY": 0, + "USE_SOA": 1, + "VECTOR_SIZE": 1, + "WORK_GROUP_SIZE_X": 32, + "WORK_GROUP_SIZE_Y": 8, + "WORK_GROUP_SIZE_Z": 1, + "Z_ITERATIONS": 4 }, "correctness": 1, "invalidity": "correct", @@ -46026,61 +49443,61 @@ { "name": "time", "unit": "", - "value": 6954.176 + "value": 4169.6 }, { "name": "dram__throughput.avg.pct_of_peak_sustained_elapsed", "type": "Double", "unit": "", - "value": 15.185445765774567 + "value": 12.058365864691888 }, { "name": "dram__sectors_read.sum", "type": "Double", "unit": "", - "value": 2110776.0 + "value": 2828.0 }, { "name": "dram__sectors_write.sum", "type": "Double", "unit": "", - "value": 1869220.0 + "value": 1839644.0 }, { "name": "lts__t_sectors.avg.pct_of_peak_sustained_elapsed", "type": "Double", "unit": "", - "value": 1.8193543674823522 + "value": 1.5739694087409546 }, { "name": "lts__t_sectors_op_read.sum", "type": "Double", "unit": "", - "value": 2213950.0 + "value": 72368.0 }, { "name": "lts__t_sectors_op_write.sum", "type": "Double", "unit": "", - "value": 2101990.0 + "value": 2104321.0 }, { "name": "l1tex__t_requests_pipe_lsu_mem_global_op_ld.avg.pct_of_peak_sustained_active", "type": "Double", "unit": "", - "value": 1.7297758285761606 + "value": 20.925691949916235 }, { "name": "l1tex__t_requests_pipe_lsu_mem_global_op_ld.sum", "type": "Double", "unit": "", - "value": 4718592.0 + "value": 33554432.0 }, { "name": "l1tex__data_pipe_lsu_wavefronts_mem_shared.avg.pct_of_peak_sustained_elapsed", "type": "Double", "unit": "", - "value": 0.018016545487510742 + "value": 0.24513472260631197 }, { "name": "l1tex__data_pipe_lsu_wavefronts_mem_shared_op_ld.sum", @@ -46110,13 +49527,13 @@ "name": "sm__warps_active.avg.pct_of_peak_sustained_active", "type": "Double", "unit": "", - "value": 81.93541509668026 + "value": 98.15840342597365 }, { "name": "smsp__cycles_active.avg.pct_of_peak_sustained_elapsed", "type": "Double", "unit": "", - "value": 99.97678461888452 + "value": 99.95796721512602 }, { "name": "smsp__sass_thread_inst_executed_op_fp16_pred_on.sum", @@ -46128,7 +49545,7 @@ "name": "smsp__sass_thread_inst_executed_op_fp32_pred_on.sum", "type": "Double", "unit": "", - "value": 4479516672.0 + "value": 5637144576.0 }, { "name": "smsp__sass_thread_inst_executed_op_fp64_pred_on.sum", @@ -46140,43 +49557,43 @@ "name": "smsp__sass_thread_inst_executed_op_integer_pred_on.sum", "type": "Double", "unit": "", - "value": 6516375552.0 + "value": 5221908480.0 }, { "name": "smsp__sass_thread_inst_executed_op_control_pred_on.sum", "type": "Double", "unit": "", - "value": 134217728.0 + "value": 536870912.0 }, { "name": "smsp__sass_thread_inst_executed_op_memory_pred_on.sum", "type": "Double", "unit": "", - "value": 167772160.0 + "value": 1090519040.0 }, { "name": "smsp__sass_thread_inst_executed_op_misc_pred_on.sum", "type": "Double", "unit": "", - "value": 1255669760.0 + "value": 2193620992.0 }, { "name": "smsp__sass_thread_inst_executed_op_conversion_pred_on.sum", "type": "Double", "unit": "", - "value": 1572864.0 + "value": 12582912.0 }, { "name": "smsp__inst_executed.sum", "type": "Double", "unit": "", - "value": 953466880.0 + "value": 535298048.0 }, { "name": "smsp__pipe_fma_cycles_active.avg.pct_of_peak_sustained_active", "type": "Double", "unit": "", - "value": 34.9016389573897 + "value": 39.66659858455398 }, { "name": "smsp__inst_executed_pipe_fp64.avg.pct_of_peak_sustained_active", @@ -46188,13 +49605,13 @@ "name": "smsp__inst_executed_pipe_xu.avg.pct_of_peak_sustained_active", "type": "Double", "unit": "", - "value": 24.604302088115226 + "value": 41.85391835894239 }, { "name": "smsp__inst_executed_pipe_lsu.avg.pct_of_peak_sustained_active", "type": "Double", "unit": "", - "value": 0.970115914851223 + "value": 10.74959035976743 }, { "name": "smsp__inst_executed_pipe_tex.avg.pct_of_peak_sustained_active", @@ -46206,7 +49623,7 @@ "name": "smsp__issue_active.avg.pct_of_peak_sustained_active", "type": "Double", "unit": "", - "value": 87.39346363771948 + "value": 83.46333110590963 }, { "name": "smsp__thread_inst_executed_per_inst_executed.ratio", @@ -46225,30 +49642,47 @@ "time" ], "times": { - "compilation": 15046.908, - "data": 61896.136, - "framework": 3636877.205, - "kernel_overhead": 1738845.687, - "profiling_overhead": 51624.341, - "profiling_runs": 1784511.041, + "compilation_time": 14322.111, + "data": 57321.319, + "framework": 1369255.454, + "kernel_overhead": 616324.009, + "profiling_overhead": 47086.806, + "profiling_runs": 648523.32, "runtimes": [ - 6954.176 + 4169.6 ], - "search_algorithm": 24.83, - "validation": 17.298 + "search_algorithm": 26.303, + "validation": 14.986 }, - "timestamp": "2026-01-27 09:27:21 UTC" + "timestamp": "2026-03-13 09:41:59 UTC" }, { + "compilation_data": { + "constant_memory_size": 0, + "global_size": { + "x": 8, + "y": 32, + "z": 32 + }, + "local_memory_size": 0, + "local_size": { + "x": 32, + "y": 8, + "z": 1 + }, + "max_work_group_size": 1024, + "private_memory_size": 0, + "registers": 30 + }, "configuration": { - "INNER_UNROLL_FACTOR": "8", - "USE_CONSTANT_MEMORY": "0", - "USE_SOA": "1", - "VECTOR_SIZE": "1", - "WORK_GROUP_SIZE_X": "16", - "WORK_GROUP_SIZE_Y": "8", - "WORK_GROUP_SIZE_Z": "1", - "Z_ITERATIONS": "32" + "INNER_UNROLL_FACTOR": 2, + "USE_CONSTANT_MEMORY": 0, + "USE_SOA": 1, + "VECTOR_SIZE": 1, + "WORK_GROUP_SIZE_X": 32, + "WORK_GROUP_SIZE_Y": 8, + "WORK_GROUP_SIZE_Z": 1, + "Z_ITERATIONS": 8 }, "correctness": 1, "invalidity": "correct", @@ -46256,61 +49690,61 @@ { "name": "time", "unit": "", - "value": 6867.616 + "value": 4308.064 }, { "name": "dram__throughput.avg.pct_of_peak_sustained_elapsed", "type": "Double", "unit": "", - "value": 15.255985666145733 + "value": 11.291026194023997 }, { "name": "dram__sectors_read.sum", "type": "Double", "unit": "", - "value": 2120396.0 + "value": 5532.0 }, { "name": "dram__sectors_write.sum", "type": "Double", "unit": "", - "value": 1878088.0 + "value": 1837308.0 }, { "name": "lts__t_sectors.avg.pct_of_peak_sustained_elapsed", "type": "Double", "unit": "", - "value": 1.8275188455635245 + "value": 1.4728514575497045 }, { "name": "lts__t_sectors_op_read.sum", "type": "Double", "unit": "", - "value": 2229081.0 + "value": 74173.0 }, { "name": "lts__t_sectors_op_write.sum", "type": "Double", "unit": "", - "value": 2106476.0 + "value": 2100786.0 }, { "name": "l1tex__t_requests_pipe_lsu_mem_global_op_ld.avg.pct_of_peak_sustained_active", "type": "Double", "unit": "", - "value": 1.7299936233645345 + "value": 9.820071068959301 }, { "name": "l1tex__t_requests_pipe_lsu_mem_global_op_ld.sum", "type": "Double", "unit": "", - "value": 4718592.0 + "value": 16777216.0 }, { "name": "l1tex__data_pipe_lsu_wavefronts_mem_shared.avg.pct_of_peak_sustained_elapsed", "type": "Double", "unit": "", - "value": 0.0180165269965517 + "value": 0.11504682414168847 }, { "name": "l1tex__data_pipe_lsu_wavefronts_mem_shared_op_ld.sum", @@ -46340,13 +49774,13 @@ "name": "sm__warps_active.avg.pct_of_peak_sustained_active", "type": "Double", "unit": "", - "value": 81.91095176667726 + "value": 98.7808463662608 }, { "name": "smsp__cycles_active.avg.pct_of_peak_sustained_elapsed", "type": "Double", "unit": "", - "value": 99.9618443629237 + "value": 99.96311717300892 }, { "name": "smsp__sass_thread_inst_executed_op_fp16_pred_on.sum", @@ -46358,7 +49792,7 @@ "name": "smsp__sass_thread_inst_executed_op_fp32_pred_on.sum", "type": "Double", "unit": "", - "value": 4479516672.0 + "value": 4966055936.0 }, { "name": "smsp__sass_thread_inst_executed_op_fp64_pred_on.sum", @@ -46370,43 +49804,43 @@ "name": "smsp__sass_thread_inst_executed_op_integer_pred_on.sum", "type": "Double", "unit": "", - "value": 6516375552.0 + "value": 5863636992.0 }, { "name": "smsp__sass_thread_inst_executed_op_control_pred_on.sum", "type": "Double", "unit": "", - "value": 134217728.0 + "value": 536870912.0 }, { "name": "smsp__sass_thread_inst_executed_op_memory_pred_on.sum", "type": "Double", "unit": "", - "value": 167772160.0 + "value": 553648128.0 }, { "name": "smsp__sass_thread_inst_executed_op_misc_pred_on.sum", "type": "Double", "unit": "", - "value": 1255669760.0 + "value": 1373634560.0 }, { "name": "smsp__sass_thread_inst_executed_op_conversion_pred_on.sum", "type": "Double", "unit": "", - "value": 1572864.0 + "value": 6291456.0 }, { "name": "smsp__inst_executed.sum", "type": "Double", "unit": "", - "value": 953466880.0 + "value": 621477888.0 }, { "name": "smsp__pipe_fma_cycles_active.avg.pct_of_peak_sustained_active", "type": "Double", "unit": "", - "value": 34.906755162318284 + "value": 43.27406630156071 }, { "name": "smsp__inst_executed_pipe_fp64.avg.pct_of_peak_sustained_active", @@ -46418,13 +49852,13 @@ "name": "smsp__inst_executed_pipe_xu.avg.pct_of_peak_sustained_active", "type": "Double", "unit": "", - "value": 24.607954180981118 + "value": 39.28380495151211 }, { "name": "smsp__inst_executed_pipe_lsu.avg.pct_of_peak_sustained_active", "type": "Double", "unit": "", - "value": 0.9702599121651491 + "value": 5.1214726181903005 }, { "name": "smsp__inst_executed_pipe_tex.avg.pct_of_peak_sustained_active", @@ -46436,7 +49870,7 @@ "name": "smsp__issue_active.avg.pct_of_peak_sustained_active", "type": "Double", "unit": "", - "value": 87.40643572056638 + "value": 90.94984038466983 }, { "name": "smsp__thread_inst_executed_per_inst_executed.ratio", @@ -46455,30 +49889,47 @@ "time" ], "times": { - "compilation": 14643.236, - "data": 60045.821, - "framework": 3633731.1059999997, - "kernel_overhead": 1738969.548, - "profiling_overhead": 50092.537, - "profiling_runs": 1784623.2, + "compilation_time": 14004.383, + "data": 57817.952, + "framework": 1874202.534, + "kernel_overhead": 867689.005, + "profiling_overhead": 47731.542, + "profiling_runs": 900964.035, "runtimes": [ - 6867.616 + 4308.064 ], - "search_algorithm": 28.51, - "validation": 15.06 + "search_algorithm": 26.904, + "validation": 15.222 }, - "timestamp": "2026-01-27 09:27:22 UTC" + "timestamp": "2026-03-13 09:42:0 UTC" }, { + "compilation_data": { + "constant_memory_size": 0, + "global_size": { + "x": 8, + "y": 32, + "z": 16 + }, + "local_memory_size": 0, + "local_size": { + "x": 32, + "y": 8, + "z": 1 + }, + "max_work_group_size": 1024, + "private_memory_size": 0, + "registers": 38 + }, "configuration": { - "INNER_UNROLL_FACTOR": "8", - "USE_CONSTANT_MEMORY": "0", - "USE_SOA": "1", - "VECTOR_SIZE": "1", - "WORK_GROUP_SIZE_X": "32", - "WORK_GROUP_SIZE_Y": "2", - "WORK_GROUP_SIZE_Z": "1", - "Z_ITERATIONS": "32" + "INNER_UNROLL_FACTOR": 2, + "USE_CONSTANT_MEMORY": 0, + "USE_SOA": 1, + "VECTOR_SIZE": 1, + "WORK_GROUP_SIZE_X": 32, + "WORK_GROUP_SIZE_Y": 8, + "WORK_GROUP_SIZE_Z": 1, + "Z_ITERATIONS": 16 }, "correctness": 1, "invalidity": "correct", @@ -46486,61 +49937,61 @@ { "name": "time", "unit": "", - "value": 6727.168 + "value": 6947.456 }, { "name": "dram__throughput.avg.pct_of_peak_sustained_elapsed", "type": "Double", "unit": "", - "value": 15.286286756302855 + "value": 6.992910675973478 }, { "name": "dram__sectors_read.sum", "type": "Double", "unit": "", - "value": 2098036.0 + "value": 5684.0 }, { "name": "dram__sectors_write.sum", "type": "Double", "unit": "", - "value": 1839868.0 + "value": 1838332.0 }, { "name": "lts__t_sectors.avg.pct_of_peak_sustained_elapsed", "type": "Double", "unit": "", - "value": 1.8388108469615132 + "value": 0.9289182236994363 }, { "name": "lts__t_sectors_op_read.sum", "type": "Double", "unit": "", - "value": 2212507.0 + "value": 115747.0 }, { "name": "lts__t_sectors_op_write.sum", "type": "Double", "unit": "", - "value": 2103441.0 + "value": 2100857.0 }, { "name": "l1tex__t_requests_pipe_lsu_mem_global_op_ld.avg.pct_of_peak_sustained_active", "type": "Double", "unit": "", - "value": 1.7302822342407522 + "value": 3.0389093450763913 }, { "name": "l1tex__t_requests_pipe_lsu_mem_global_op_ld.sum", "type": "Double", "unit": "", - "value": 4718592.0 + "value": 8388608.0 }, { "name": "l1tex__data_pipe_lsu_wavefronts_mem_shared.avg.pct_of_peak_sustained_elapsed", "type": "Double", "unit": "", - "value": 0.01802073123223261 + "value": 0.03560326351659298 }, { "name": "l1tex__data_pipe_lsu_wavefronts_mem_shared_op_ld.sum", @@ -46570,13 +50021,13 @@ "name": "sm__warps_active.avg.pct_of_peak_sustained_active", "type": "Double", "unit": "", - "value": 81.95285947958709 + "value": 98.55755298323754 }, { "name": "smsp__cycles_active.avg.pct_of_peak_sustained_elapsed", "type": "Double", "unit": "", - "value": 99.97745868738565 + "value": 99.96194263783444 }, { "name": "smsp__sass_thread_inst_executed_op_fp16_pred_on.sum", @@ -46588,7 +50039,7 @@ "name": "smsp__sass_thread_inst_executed_op_fp32_pred_on.sum", "type": "Double", "unit": "", - "value": 4479516672.0 + "value": 4630511616.0 }, { "name": "smsp__sass_thread_inst_executed_op_fp64_pred_on.sum", @@ -46600,43 +50051,43 @@ "name": "smsp__sass_thread_inst_executed_op_integer_pred_on.sum", "type": "Double", "unit": "", - "value": 6516375552.0 + "value": 8738832384.0 }, { "name": "smsp__sass_thread_inst_executed_op_control_pred_on.sum", "type": "Double", "unit": "", - "value": 134217728.0 + "value": 536870912.0 }, { "name": "smsp__sass_thread_inst_executed_op_memory_pred_on.sum", "type": "Double", "unit": "", - "value": 167772160.0 + "value": 285212672.0 }, { "name": "smsp__sass_thread_inst_executed_op_misc_pred_on.sum", "type": "Double", "unit": "", - "value": 1255669760.0 + "value": 1157627904.0 }, { "name": "smsp__sass_thread_inst_executed_op_conversion_pred_on.sum", "type": "Double", "unit": "", - "value": 1572864.0 + "value": 3145728.0 }, { "name": "smsp__inst_executed.sum", "type": "Double", "unit": "", - "value": 953466880.0 + "value": 970653696.0 }, { "name": "smsp__pipe_fma_cycles_active.avg.pct_of_peak_sustained_active", "type": "Double", "unit": "", - "value": 34.90960636455955 + "value": 36.453718280763894 }, { "name": "smsp__inst_executed_pipe_fp64.avg.pct_of_peak_sustained_active", @@ -46648,13 +50099,13 @@ "name": "smsp__inst_executed_pipe_xu.avg.pct_of_peak_sustained_active", "type": "Double", "unit": "", - "value": 24.60985242617757 + "value": 24.31441465217009 }, { "name": "smsp__inst_executed_pipe_lsu.avg.pct_of_peak_sustained_active", "type": "Double", "unit": "", - "value": 0.9703347575262885 + "value": 1.63243750716474 }, { "name": "smsp__inst_executed_pipe_tex.avg.pct_of_peak_sustained_active", @@ -46666,7 +50117,7 @@ "name": "smsp__issue_active.avg.pct_of_peak_sustained_active", "type": "Double", "unit": "", - "value": 87.41317820901241 + "value": 87.92045809118143 }, { "name": "smsp__thread_inst_executed_per_inst_executed.ratio", @@ -46685,30 +50136,47 @@ "time" ], "times": { - "compilation": 13970.205, - "data": 63809.579, - "framework": 3642059.632, - "kernel_overhead": 1739821.306, - "profiling_overhead": 53662.601, - "profiling_runs": 1784766.146, + "compilation_time": 14688.476, + "data": 58110.86, + "framework": 3105753.9129999997, + "kernel_overhead": 1477048.88, + "profiling_overhead": 47922.855, + "profiling_runs": 1522671.318, "runtimes": [ - 6727.168 + 6947.456 ], - "search_algorithm": 24.276, - "validation": 15.563 + "search_algorithm": 26.463, + "validation": 15.887 }, - "timestamp": "2026-01-27 09:27:24 UTC" + "timestamp": "2026-03-13 09:42:1 UTC" }, { + "compilation_data": { + "constant_memory_size": 0, + "global_size": { + "x": 8, + "y": 32, + "z": 8 + }, + "local_memory_size": 0, + "local_size": { + "x": 32, + "y": 8, + "z": 1 + }, + "max_work_group_size": 1024, + "private_memory_size": 128, + "registers": 32 + }, "configuration": { - "INNER_UNROLL_FACTOR": "8", - "USE_CONSTANT_MEMORY": "0", - "USE_SOA": "1", - "VECTOR_SIZE": "1", - "WORK_GROUP_SIZE_X": "32", - "WORK_GROUP_SIZE_Y": "4", - "WORK_GROUP_SIZE_Z": "1", - "Z_ITERATIONS": "32" + "INNER_UNROLL_FACTOR": 2, + "USE_CONSTANT_MEMORY": 0, + "USE_SOA": 1, + "VECTOR_SIZE": 1, + "WORK_GROUP_SIZE_X": 32, + "WORK_GROUP_SIZE_Y": 8, + "WORK_GROUP_SIZE_Z": 1, + "Z_ITERATIONS": 32 }, "correctness": 1, "invalidity": "correct", @@ -46716,61 +50184,61 @@ { "name": "time", "unit": "", - "value": 6826.432 + "value": 9408.8 }, { "name": "dram__throughput.avg.pct_of_peak_sustained_elapsed", "type": "Double", "unit": "", - "value": 15.218508743315242 + "value": 6.034830869377944 }, { "name": "dram__sectors_read.sum", "type": "Double", "unit": "", - "value": 2103644.0 + "value": 29292.0 }, { "name": "dram__sectors_write.sum", "type": "Double", "unit": "", - "value": 1837604.0 + "value": 2194492.0 }, { "name": "lts__t_sectors.avg.pct_of_peak_sustained_elapsed", "type": "Double", "unit": "", - "value": 1.8271114867841498 + "value": 79.05656990975108 }, { "name": "lts__t_sectors_op_read.sum", "type": "Double", "unit": "", - "value": 2212581.0 + "value": 118147061.0 }, { "name": "lts__t_sectors_op_write.sum", "type": "Double", "unit": "", - "value": 2100454.0 + "value": 138423319.0 }, { "name": "l1tex__t_requests_pipe_lsu_mem_global_op_ld.avg.pct_of_peak_sustained_active", "type": "Double", "unit": "", - "value": 1.7302724505348734 + "value": 1.1260423660854393 }, { "name": "l1tex__t_requests_pipe_lsu_mem_global_op_ld.sum", "type": "Double", "unit": "", - "value": 4718592.0 + "value": 4194304.0 }, { "name": "l1tex__data_pipe_lsu_wavefronts_mem_shared.avg.pct_of_peak_sustained_elapsed", "type": "Double", "unit": "", - "value": 0.01802015670673189 + "value": 0.013125143716478438 }, { "name": "l1tex__data_pipe_lsu_wavefronts_mem_shared_op_ld.sum", @@ -46788,25 +50256,25 @@ "name": "l1tex__t_sectors_pipe_lsu_mem_local_op_ld.sum", "type": "Double", "unit": "", - "value": 0.0 + "value": 136314880.0 }, { "name": "l1tex__t_sectors_pipe_lsu_mem_local_op_st.sum", "type": "Double", "unit": "", - "value": 0.0 + "value": 136314880.0 }, { "name": "sm__warps_active.avg.pct_of_peak_sustained_active", "type": "Double", "unit": "", - "value": 81.93048812410923 + "value": 94.08860375161854 }, { "name": "smsp__cycles_active.avg.pct_of_peak_sustained_elapsed", "type": "Double", "unit": "", - "value": 99.98034792144796 + "value": 98.42772857457008 }, { "name": "smsp__sass_thread_inst_executed_op_fp16_pred_on.sum", @@ -46818,7 +50286,7 @@ "name": "smsp__sass_thread_inst_executed_op_fp32_pred_on.sum", "type": "Double", "unit": "", - "value": 4479516672.0 + "value": 4462739456.0 }, { "name": "smsp__sass_thread_inst_executed_op_fp64_pred_on.sum", @@ -46830,25 +50298,25 @@ "name": "smsp__sass_thread_inst_executed_op_integer_pred_on.sum", "type": "Double", "unit": "", - "value": 6516375552.0 + "value": 1918369792.0 }, { "name": "smsp__sass_thread_inst_executed_op_control_pred_on.sum", "type": "Double", "unit": "", - "value": 134217728.0 + "value": 536870912.0 }, { "name": "smsp__sass_thread_inst_executed_op_memory_pred_on.sum", "type": "Double", "unit": "", - "value": 167772160.0 + "value": 1245708288.0 }, { "name": "smsp__sass_thread_inst_executed_op_misc_pred_on.sum", "type": "Double", "unit": "", - "value": 1255669760.0 + "value": 72351744.0 }, { "name": "smsp__sass_thread_inst_executed_op_conversion_pred_on.sum", @@ -46860,13 +50328,13 @@ "name": "smsp__inst_executed.sum", "type": "Double", "unit": "", - "value": 953466880.0 + "value": 258555904.0 }, { "name": "smsp__pipe_fma_cycles_active.avg.pct_of_peak_sustained_active", "type": "Double", "unit": "", - "value": 34.90735049434808 + "value": 7.934046371401735 }, { "name": "smsp__inst_executed_pipe_fp64.avg.pct_of_peak_sustained_active", @@ -46878,13 +50346,13 @@ "name": "smsp__inst_executed_pipe_xu.avg.pct_of_peak_sustained_active", "type": "Double", "unit": "", - "value": 24.608356677175845 + "value": 18.206451048316122 }, { "name": "smsp__inst_executed_pipe_lsu.avg.pct_of_peak_sustained_active", "type": "Double", "unit": "", - "value": 0.9702757820712643 + "value": 5.287249395012702 }, { "name": "smsp__inst_executed_pipe_tex.avg.pct_of_peak_sustained_active", @@ -46896,7 +50364,7 @@ "name": "smsp__issue_active.avg.pct_of_peak_sustained_active", "type": "Double", "unit": "", - "value": 87.40786537040695 + "value": 17.53669244428573 }, { "name": "smsp__thread_inst_executed_per_inst_executed.ratio", @@ -46915,30 +50383,47 @@ "time" ], "times": { - "compilation": 14057.589, - "data": 60965.751, - "framework": 3635528.9189999998, - "kernel_overhead": 1739340.226, - "profiling_overhead": 50731.041, - "profiling_runs": 1784491.901, + "compilation_time": 14140.094, + "data": 59115.35, + "framework": 546988.5319999999, + "kernel_overhead": 189143.778, + "profiling_overhead": 49029.821, + "profiling_runs": 249699.583, "runtimes": [ - 6826.432 + 9408.8 ], - "search_algorithm": 25.921, - "validation": 16.825 + "search_algorithm": 26.147, + "validation": 15.918 }, - "timestamp": "2026-01-27 09:27:26 UTC" + "timestamp": "2026-03-13 09:42:2 UTC" }, { + "compilation_data": { + "constant_memory_size": 0, + "global_size": { + "x": 8, + "y": 32, + "z": 32 + }, + "local_memory_size": 0, + "local_size": { + "x": 32, + "y": 8, + "z": 1 + }, + "max_work_group_size": 1024, + "private_memory_size": 0, + "registers": 31 + }, "configuration": { - "INNER_UNROLL_FACTOR": "8", - "USE_CONSTANT_MEMORY": "0", - "USE_SOA": "1", - "VECTOR_SIZE": "1", - "WORK_GROUP_SIZE_X": "32", - "WORK_GROUP_SIZE_Y": "8", - "WORK_GROUP_SIZE_Z": "1", - "Z_ITERATIONS": "32" + "INNER_UNROLL_FACTOR": 4, + "USE_CONSTANT_MEMORY": 0, + "USE_SOA": 1, + "VECTOR_SIZE": 1, + "WORK_GROUP_SIZE_X": 32, + "WORK_GROUP_SIZE_Y": 8, + "WORK_GROUP_SIZE_Z": 1, + "Z_ITERATIONS": 8 }, "correctness": 1, "invalidity": "correct", @@ -46946,61 +50431,61 @@ { "name": "time", "unit": "", - "value": 6826.304 + "value": 2921.344 }, { "name": "dram__throughput.avg.pct_of_peak_sustained_elapsed", "type": "Double", "unit": "", - "value": 15.270860952541131 + "value": 16.89516365556077 }, { "name": "dram__sectors_read.sum", "type": "Double", "unit": "", - "value": 2111512.0 + "value": 6424.0 }, { "name": "dram__sectors_write.sum", "type": "Double", "unit": "", - "value": 1839932.0 + "value": 1836960.0 }, { "name": "lts__t_sectors.avg.pct_of_peak_sustained_elapsed", "type": "Double", "unit": "", - "value": 1.832920165123025 + "value": 2.1811783595689165 }, { "name": "lts__t_sectors_op_read.sum", "type": "Double", "unit": "", - "value": 2220540.0 + "value": 52680.0 }, { "name": "lts__t_sectors_op_write.sum", "type": "Double", "unit": "", - "value": 2102519.0 + "value": 2100895.0 }, { "name": "l1tex__t_requests_pipe_lsu_mem_global_op_ld.avg.pct_of_peak_sustained_active", "type": "Double", "unit": "", - "value": 1.7316805758745704 + "value": 14.86593618208509 }, { "name": "l1tex__t_requests_pipe_lsu_mem_global_op_ld.sum", "type": "Double", "unit": "", - "value": 4718592.0 + "value": 16777216.0 }, { "name": "l1tex__data_pipe_lsu_wavefronts_mem_shared.avg.pct_of_peak_sustained_elapsed", "type": "Double", "unit": "", - "value": 0.018035472110185462 + "value": 0.1741458097014862 }, { "name": "l1tex__data_pipe_lsu_wavefronts_mem_shared_op_ld.sum", @@ -47030,13 +50515,13 @@ "name": "sm__warps_active.avg.pct_of_peak_sustained_active", "type": "Double", "unit": "", - "value": 81.91133897550851 + "value": 98.59520411907219 }, { "name": "smsp__cycles_active.avg.pct_of_peak_sustained_elapsed", "type": "Double", "unit": "", - "value": 99.97206275942136 + "value": 99.95840777188033 }, { "name": "smsp__sass_thread_inst_executed_op_fp16_pred_on.sum", @@ -47048,7 +50533,7 @@ "name": "smsp__sass_thread_inst_executed_op_fp32_pred_on.sum", "type": "Double", "unit": "", - "value": 4479516672.0 + "value": 4966055936.0 }, { "name": "smsp__sass_thread_inst_executed_op_fp64_pred_on.sum", @@ -47060,43 +50545,43 @@ "name": "smsp__sass_thread_inst_executed_op_integer_pred_on.sum", "type": "Double", "unit": "", - "value": 6516375552.0 + "value": 3042967552.0 }, { "name": "smsp__sass_thread_inst_executed_op_control_pred_on.sum", "type": "Double", "unit": "", - "value": 134217728.0 + "value": 268435456.0 }, { "name": "smsp__sass_thread_inst_executed_op_memory_pred_on.sum", "type": "Double", "unit": "", - "value": 167772160.0 + "value": 553648128.0 }, { "name": "smsp__sass_thread_inst_executed_op_misc_pred_on.sum", "type": "Double", "unit": "", - "value": 1255669760.0 + "value": 1644167168.0 }, { "name": "smsp__sass_thread_inst_executed_op_conversion_pred_on.sum", "type": "Double", "unit": "", - "value": 1572864.0 + "value": 6291456.0 }, { "name": "smsp__inst_executed.sum", "type": "Double", "unit": "", - "value": 953466880.0 + "value": 399179776.0 }, { "name": "smsp__pipe_fma_cycles_active.avg.pct_of_peak_sustained_active", "type": "Double", "unit": "", - "value": 34.940146305774924 + "value": 45.96481466681637 }, { "name": "smsp__inst_executed_pipe_fp64.avg.pct_of_peak_sustained_active", @@ -47108,13 +50593,13 @@ "name": "smsp__inst_executed_pipe_xu.avg.pct_of_peak_sustained_active", "type": "Double", "unit": "", - "value": 24.63131256348842 + "value": 59.46650315508768 }, { "name": "smsp__inst_executed_pipe_lsu.avg.pct_of_peak_sustained_active", "type": "Double", "unit": "", - "value": 0.9711809030769971 + "value": 7.752713057816606 }, { "name": "smsp__inst_executed_pipe_tex.avg.pct_of_peak_sustained_active", @@ -47126,7 +50611,7 @@ "name": "smsp__issue_active.avg.pct_of_peak_sustained_active", "type": "Double", "unit": "", - "value": 87.48941639208438 + "value": 88.43110194536203 }, { "name": "smsp__thread_inst_executed_per_inst_executed.ratio", @@ -47145,30 +50630,47 @@ "time" ], "times": { - "compilation": 14609.396, - "data": 60711.413, - "framework": 3629098.42, - "kernel_overhead": 1736314.973, - "profiling_overhead": 50565.149, - "profiling_runs": 1781506.885, + "compilation_time": 14451.748, + "data": 57410.566, + "framework": 1145370.005, + "kernel_overhead": 507356.419, + "profiling_overhead": 47246.96, + "profiling_runs": 533356.06, "runtimes": [ - 6826.304 + 2921.344 ], - "search_algorithm": 27.523, - "validation": 14.944 + "search_algorithm": 25.234, + "validation": 13.122 }, - "timestamp": "2026-01-27 09:27:28 UTC" + "timestamp": "2026-03-13 09:42:2 UTC" }, { + "compilation_data": { + "constant_memory_size": 0, + "global_size": { + "x": 8, + "y": 32, + "z": 16 + }, + "local_memory_size": 0, + "local_size": { + "x": 32, + "y": 8, + "z": 1 + }, + "max_work_group_size": 1024, + "private_memory_size": 0, + "registers": 38 + }, "configuration": { - "INNER_UNROLL_FACTOR": "16", - "USE_CONSTANT_MEMORY": "0", - "USE_SOA": "1", - "VECTOR_SIZE": "1", - "WORK_GROUP_SIZE_X": "16", - "WORK_GROUP_SIZE_Y": "4", - "WORK_GROUP_SIZE_Z": "1", - "Z_ITERATIONS": "32" + "INNER_UNROLL_FACTOR": 4, + "USE_CONSTANT_MEMORY": 0, + "USE_SOA": 1, + "VECTOR_SIZE": 1, + "WORK_GROUP_SIZE_X": 32, + "WORK_GROUP_SIZE_Y": 8, + "WORK_GROUP_SIZE_Z": 1, + "Z_ITERATIONS": 16 }, "correctness": 1, "invalidity": "correct", @@ -47176,61 +50678,61 @@ { "name": "time", "unit": "", - "value": 6314.176 + "value": 3611.36 }, { "name": "dram__throughput.avg.pct_of_peak_sustained_elapsed", "type": "Double", "unit": "", - "value": 16.620917115643643 + "value": 13.573401435541378 }, { "name": "dram__sectors_read.sum", "type": "Double", "unit": "", - "value": 2110628.0 + "value": 10524.0 }, { "name": "dram__sectors_write.sum", "type": "Double", "unit": "", - "value": 1869712.0 + "value": 1841192.0 }, { "name": "lts__t_sectors.avg.pct_of_peak_sustained_elapsed", "type": "Double", "unit": "", - "value": 1.9741191754067677 + "value": 1.7962005635465745 }, { "name": "lts__t_sectors_op_read.sum", "type": "Double", "unit": "", - "value": 2206861.0 + "value": 67629.0 }, { "name": "lts__t_sectors_op_write.sum", "type": "Double", "unit": "", - "value": 2102352.0 + "value": 2105063.0 }, { "name": "l1tex__t_requests_pipe_lsu_mem_global_op_ld.avg.pct_of_peak_sustained_active", "type": "Double", "unit": "", - "value": 1.8713363237882605 + "value": 6.017813733210885 }, { "name": "l1tex__t_requests_pipe_lsu_mem_global_op_ld.sum", "type": "Double", "unit": "", - "value": 4718592.0 + "value": 8388608.0 }, { "name": "l1tex__data_pipe_lsu_wavefronts_mem_shared.avg.pct_of_peak_sustained_elapsed", "type": "Double", "unit": "", - "value": 0.019487285212468185 + "value": 0.07047766455097934 }, { "name": "l1tex__data_pipe_lsu_wavefronts_mem_shared_op_ld.sum", @@ -47260,13 +50762,13 @@ "name": "sm__warps_active.avg.pct_of_peak_sustained_active", "type": "Double", "unit": "", - "value": 81.97677782338405 + "value": 97.91542773767054 }, { "name": "smsp__cycles_active.avg.pct_of_peak_sustained_elapsed", "type": "Double", "unit": "", - "value": 99.95562025722963 + "value": 99.92697183606303 }, { "name": "smsp__sass_thread_inst_executed_op_fp16_pred_on.sum", @@ -47278,7 +50780,7 @@ "name": "smsp__sass_thread_inst_executed_op_fp32_pred_on.sum", "type": "Double", "unit": "", - "value": 4479516672.0 + "value": 4630511616.0 }, { "name": "smsp__sass_thread_inst_executed_op_fp64_pred_on.sum", @@ -47290,43 +50792,43 @@ "name": "smsp__sass_thread_inst_executed_op_integer_pred_on.sum", "type": "Double", "unit": "", - "value": 5912395776.0 + "value": 3301965824.0 }, { "name": "smsp__sass_thread_inst_executed_op_control_pred_on.sum", "type": "Double", "unit": "", - "value": 101187584.0 + "value": 268435456.0 }, { "name": "smsp__sass_thread_inst_executed_op_memory_pred_on.sum", "type": "Double", "unit": "", - "value": 167772160.0 + "value": 285212672.0 }, { "name": "smsp__sass_thread_inst_executed_op_misc_pred_on.sum", "type": "Double", "unit": "", - "value": 1087897600.0 + "value": 1429209088.0 }, { "name": "smsp__sass_thread_inst_executed_op_conversion_pred_on.sum", "type": "Double", "unit": "", - "value": 1572864.0 + "value": 3145728.0 }, { "name": "smsp__inst_executed.sum", "type": "Double", "unit": "", - "value": 914653184.0 + "value": 513540096.0 }, { "name": "smsp__pipe_fma_cycles_active.avg.pct_of_peak_sustained_active", "type": "Double", "unit": "", - "value": 38.798576597845894 + "value": 42.08823580657388 }, { "name": "smsp__inst_executed_pipe_fp64.avg.pct_of_peak_sustained_active", @@ -47338,13 +50840,13 @@ "name": "smsp__inst_executed_pipe_xu.avg.pct_of_peak_sustained_active", "type": "Double", "unit": "", - "value": 26.618453277850723 + "value": 48.14791387094246 }, { "name": "smsp__inst_executed_pipe_lsu.avg.pct_of_peak_sustained_active", "type": "Double", "unit": "", - "value": 1.0495312998957256 + "value": 3.2325869908469667 }, { "name": "smsp__inst_executed_pipe_tex.avg.pct_of_peak_sustained_active", @@ -47356,7 +50858,7 @@ "name": "smsp__issue_active.avg.pct_of_peak_sustained_active", "type": "Double", "unit": "", - "value": 90.80427366997401 + "value": 92.11188902210881 }, { "name": "smsp__thread_inst_executed_per_inst_executed.ratio", @@ -47375,30 +50877,47 @@ "time" ], "times": { - "compilation": 14541.055, - "data": 60908.078, - "framework": 3558445.183, - "kernel_overhead": 1701914.243, - "profiling_overhead": 50704.265, - "profiling_runs": 1744918.597, + "compilation_time": 14000.68, + "data": 57196.096, + "framework": 1828222.161, + "kernel_overhead": 847292.162, + "profiling_overhead": 47079.024, + "profiling_runs": 876654.879, "runtimes": [ - 6314.176 + 3611.36 ], - "search_algorithm": 36.201, - "validation": 16.132 + "search_algorithm": 38.218, + "validation": 16.724 }, - "timestamp": "2026-01-27 09:27:30 UTC" + "timestamp": "2026-03-13 09:42:3 UTC" }, { + "compilation_data": { + "constant_memory_size": 0, + "global_size": { + "x": 8, + "y": 32, + "z": 8 + }, + "local_memory_size": 0, + "local_size": { + "x": 32, + "y": 8, + "z": 1 + }, + "max_work_group_size": 1024, + "private_memory_size": 0, + "registers": 56 + }, "configuration": { - "INNER_UNROLL_FACTOR": "16", - "USE_CONSTANT_MEMORY": "0", - "USE_SOA": "1", - "VECTOR_SIZE": "1", - "WORK_GROUP_SIZE_X": "16", - "WORK_GROUP_SIZE_Y": "8", - "WORK_GROUP_SIZE_Z": "1", - "Z_ITERATIONS": "32" + "INNER_UNROLL_FACTOR": 4, + "USE_CONSTANT_MEMORY": 0, + "USE_SOA": 1, + "VECTOR_SIZE": 1, + "WORK_GROUP_SIZE_X": 32, + "WORK_GROUP_SIZE_Y": 8, + "WORK_GROUP_SIZE_Z": 1, + "Z_ITERATIONS": 32 }, "correctness": 1, "invalidity": "correct", @@ -47406,61 +50925,61 @@ { "name": "time", "unit": "", - "value": 6357.952 + "value": 5676.8 }, { "name": "dram__throughput.avg.pct_of_peak_sustained_elapsed", "type": "Double", "unit": "", - "value": 16.588712747154464 + "value": 8.569879143564238 }, { "name": "dram__sectors_read.sum", "type": "Double", "unit": "", - "value": 2114356.0 + "value": 6608.0 }, { "name": "dram__sectors_write.sum", "type": "Double", "unit": "", - "value": 1874112.0 + "value": 1837268.0 }, { "name": "lts__t_sectors.avg.pct_of_peak_sustained_elapsed", "type": "Double", "unit": "", - "value": 1.984095168911022 + "value": 1.1291042753164888 }, { "name": "lts__t_sectors_op_read.sum", "type": "Double", "unit": "", - "value": 2213229.0 + "value": 97320.0 }, { "name": "lts__t_sectors_op_write.sum", "type": "Double", "unit": "", - "value": 2109054.0 + "value": 2100996.0 }, { "name": "l1tex__t_requests_pipe_lsu_mem_global_op_ld.avg.pct_of_peak_sustained_active", "type": "Double", "unit": "", - "value": 1.871137307941424 + "value": 1.8624342616460055 }, { "name": "l1tex__t_requests_pipe_lsu_mem_global_op_ld.sum", "type": "Double", "unit": "", - "value": 4718592.0 + "value": 4194304.0 }, { "name": "l1tex__data_pipe_lsu_wavefronts_mem_shared.avg.pct_of_peak_sustained_elapsed", "type": "Double", "unit": "", - "value": 0.01948806218214341 + "value": 0.021817727798704188 }, { "name": "l1tex__data_pipe_lsu_wavefronts_mem_shared_op_ld.sum", @@ -47490,13 +51009,13 @@ "name": "sm__warps_active.avg.pct_of_peak_sustained_active", "type": "Double", "unit": "", - "value": 81.92991810296571 + "value": 65.5400353014946 }, { "name": "smsp__cycles_active.avg.pct_of_peak_sustained_elapsed", "type": "Double", "unit": "", - "value": 99.97258321622998 + "value": 99.94063944106135 }, { "name": "smsp__sass_thread_inst_executed_op_fp16_pred_on.sum", @@ -47508,7 +51027,7 @@ "name": "smsp__sass_thread_inst_executed_op_fp32_pred_on.sum", "type": "Double", "unit": "", - "value": 4479516672.0 + "value": 4462739456.0 }, { "name": "smsp__sass_thread_inst_executed_op_fp64_pred_on.sum", @@ -47520,25 +51039,25 @@ "name": "smsp__sass_thread_inst_executed_op_integer_pred_on.sum", "type": "Double", "unit": "", - "value": 5912395776.0 + "value": 4806148096.0 }, { "name": "smsp__sass_thread_inst_executed_op_control_pred_on.sum", "type": "Double", "unit": "", - "value": 101187584.0 + "value": 268435456.0 }, { "name": "smsp__sass_thread_inst_executed_op_memory_pred_on.sum", "type": "Double", "unit": "", - "value": 167772160.0 + "value": 150994944.0 }, { "name": "smsp__sass_thread_inst_executed_op_misc_pred_on.sum", "type": "Double", "unit": "", - "value": 1087897600.0 + "value": 1222115328.0 }, { "name": "smsp__sass_thread_inst_executed_op_conversion_pred_on.sum", @@ -47550,13 +51069,13 @@ "name": "smsp__inst_executed.sum", "type": "Double", "unit": "", - "value": 914653184.0 + "value": 868564992.0 }, { "name": "smsp__pipe_fma_cycles_active.avg.pct_of_peak_sustained_active", "type": "Double", "unit": "", - "value": 38.793328185657344 + "value": 42.342481552246475 }, { "name": "smsp__inst_executed_pipe_fp64.avg.pct_of_peak_sustained_active", @@ -47568,13 +51087,13 @@ "name": "smsp__inst_executed_pipe_xu.avg.pct_of_peak_sustained_active", "type": "Double", "unit": "", - "value": 26.61499787577113 + "value": 29.806164126787955 }, { "name": "smsp__inst_executed_pipe_lsu.avg.pct_of_peak_sustained_active", "type": "Double", "unit": "", - "value": 1.0493950578459565 + "value": 1.0587883008905388 }, { "name": "smsp__inst_executed_pipe_tex.avg.pct_of_peak_sustained_active", @@ -47586,7 +51105,7 @@ "name": "smsp__issue_active.avg.pct_of_peak_sustained_active", "type": "Double", "unit": "", - "value": 90.79248417730727 + "value": 96.44304233478216 }, { "name": "smsp__thread_inst_executed_per_inst_executed.ratio", @@ -47605,30 +51124,47 @@ "time" ], "times": { - "compilation": 15085.103, - "data": 59288.787, - "framework": 3552151.649, - "kernel_overhead": 1700497.713, - "profiling_overhead": 49240.643, - "profiling_runs": 1743124.506, + "compilation_time": 14567.808, + "data": 58130.116, + "framework": 3292156.156, + "kernel_overhead": 1573616.662, + "profiling_overhead": 47530.776, + "profiling_runs": 1612878.602, "runtimes": [ - 6357.952 + 5676.8 ], - "search_algorithm": 27.768, - "validation": 14.187 + "search_algorithm": 27.123, + "validation": 15.216 }, - "timestamp": "2026-01-27 09:27:32 UTC" + "timestamp": "2026-03-13 09:42:5 UTC" }, { + "compilation_data": { + "constant_memory_size": 0, + "global_size": { + "x": 8, + "y": 32, + "z": 16 + }, + "local_memory_size": 0, + "local_size": { + "x": 32, + "y": 8, + "z": 1 + }, + "max_work_group_size": 1024, + "private_memory_size": 0, + "registers": 38 + }, "configuration": { - "INNER_UNROLL_FACTOR": "16", - "USE_CONSTANT_MEMORY": "0", - "USE_SOA": "1", - "VECTOR_SIZE": "1", - "WORK_GROUP_SIZE_X": "32", - "WORK_GROUP_SIZE_Y": "2", - "WORK_GROUP_SIZE_Z": "1", - "Z_ITERATIONS": "32" + "INNER_UNROLL_FACTOR": 8, + "USE_CONSTANT_MEMORY": 0, + "USE_SOA": 1, + "VECTOR_SIZE": 1, + "WORK_GROUP_SIZE_X": 32, + "WORK_GROUP_SIZE_Y": 8, + "WORK_GROUP_SIZE_Z": 1, + "Z_ITERATIONS": 16 }, "correctness": 1, "invalidity": "correct", @@ -47636,61 +51172,61 @@ { "name": "time", "unit": "", - "value": 6391.488 + "value": 3676.288 }, { "name": "dram__throughput.avg.pct_of_peak_sustained_elapsed", "type": "Double", "unit": "", - "value": 16.5061806344139 + "value": 14.332805645433433 }, { "name": "dram__sectors_read.sum", "type": "Double", "unit": "", - "value": 2103472.0 + "value": 9048.0 }, { "name": "dram__sectors_write.sum", "type": "Double", "unit": "", - "value": 1840940.0 + "value": 1839384.0 }, { "name": "lts__t_sectors.avg.pct_of_peak_sustained_elapsed", "type": "Double", "unit": "", - "value": 1.987970117095943 + "value": 1.856549209899996 }, { "name": "lts__t_sectors_op_read.sum", "type": "Double", "unit": "", - "value": 2210561.0 + "value": 64755.0 }, { "name": "lts__t_sectors_op_write.sum", "type": "Double", "unit": "", - "value": 2108914.0 + "value": 2101279.0 }, { "name": "l1tex__t_requests_pipe_lsu_mem_global_op_ld.avg.pct_of_peak_sustained_active", "type": "Double", "unit": "", - "value": 1.8716866838302628 + "value": 6.25191366202367 }, { "name": "l1tex__t_requests_pipe_lsu_mem_global_op_ld.sum", "type": "Double", "unit": "", - "value": 4718592.0 + "value": 8388608.0 }, { "name": "l1tex__data_pipe_lsu_wavefronts_mem_shared.avg.pct_of_peak_sustained_elapsed", "type": "Double", "unit": "", - "value": 0.01949132835750345 + "value": 0.07322852143641607 }, { "name": "l1tex__data_pipe_lsu_wavefronts_mem_shared_op_ld.sum", @@ -47720,13 +51256,13 @@ "name": "sm__warps_active.avg.pct_of_peak_sustained_active", "type": "Double", "unit": "", - "value": 81.99590198551324 + "value": 98.24297297292934 }, { "name": "smsp__cycles_active.avg.pct_of_peak_sustained_elapsed", "type": "Double", "unit": "", - "value": 99.96648609402853 + "value": 99.93176150499153 }, { "name": "smsp__sass_thread_inst_executed_op_fp16_pred_on.sum", @@ -47738,7 +51274,7 @@ "name": "smsp__sass_thread_inst_executed_op_fp32_pred_on.sum", "type": "Double", "unit": "", - "value": 4479516672.0 + "value": 4630511616.0 }, { "name": "smsp__sass_thread_inst_executed_op_fp64_pred_on.sum", @@ -47750,43 +51286,43 @@ "name": "smsp__sass_thread_inst_executed_op_integer_pred_on.sum", "type": "Double", "unit": "", - "value": 5912395776.0 + "value": 2563768320.0 }, { "name": "smsp__sass_thread_inst_executed_op_control_pred_on.sum", "type": "Double", "unit": "", - "value": 101187584.0 + "value": 134217728.0 }, { "name": "smsp__sass_thread_inst_executed_op_memory_pred_on.sum", "type": "Double", "unit": "", - "value": 167772160.0 + "value": 285212672.0 }, { "name": "smsp__sass_thread_inst_executed_op_misc_pred_on.sum", "type": "Double", "unit": "", - "value": 1087897600.0 + "value": 1362100224.0 }, { "name": "smsp__sass_thread_inst_executed_op_conversion_pred_on.sum", "type": "Double", "unit": "", - "value": 1572864.0 + "value": 3145728.0 }, { "name": "smsp__inst_executed.sum", "type": "Double", "unit": "", - "value": 914653184.0 + "value": 484179968.0 }, { "name": "smsp__pipe_fma_cycles_active.avg.pct_of_peak_sustained_active", "type": "Double", "unit": "", - "value": 38.80226677769365 + "value": 42.94624167489795 }, { "name": "smsp__inst_executed_pipe_fp64.avg.pct_of_peak_sustained_active", @@ -47798,13 +51334,13 @@ "name": "smsp__inst_executed_pipe_xu.avg.pct_of_peak_sustained_active", "type": "Double", "unit": "", - "value": 26.621082081862212 + "value": 50.02480680922421 }, { "name": "smsp__inst_executed_pipe_lsu.avg.pct_of_peak_sustained_active", "type": "Double", "unit": "", - "value": 1.049634950249206 + "value": 3.35859908997477 }, { "name": "smsp__inst_executed_pipe_tex.avg.pct_of_peak_sustained_active", @@ -47816,7 +51352,7 @@ "name": "smsp__issue_active.avg.pct_of_peak_sustained_active", "type": "Double", "unit": "", - "value": 90.81324197760276 + "value": 90.23111414031398 }, { "name": "smsp__thread_inst_executed_per_inst_executed.ratio", @@ -47835,30 +51371,47 @@ "time" ], "times": { - "compilation": 16894.381, - "data": 62747.149, - "framework": 3560825.3099999996, - "kernel_overhead": 1701381.413, - "profiling_overhead": 52448.519, - "profiling_runs": 1744248.229, + "compilation_time": 14365.379, + "data": 58313.609, + "framework": 1703970.994, + "kernel_overhead": 784748.147, + "profiling_overhead": 47629.487, + "profiling_runs": 813279.751, "runtimes": [ - 6391.488 + 3676.288 ], - "search_algorithm": 30.75, - "validation": 17.817 + "search_algorithm": 24.754, + "validation": 11.979 }, - "timestamp": "2026-01-27 09:27:34 UTC" + "timestamp": "2026-03-13 09:42:6 UTC" }, { + "compilation_data": { + "constant_memory_size": 0, + "global_size": { + "x": 8, + "y": 32, + "z": 8 + }, + "local_memory_size": 0, + "local_size": { + "x": 32, + "y": 8, + "z": 1 + }, + "max_work_group_size": 1024, + "private_memory_size": 0, + "registers": 48 + }, "configuration": { - "INNER_UNROLL_FACTOR": "16", - "USE_CONSTANT_MEMORY": "0", - "USE_SOA": "1", - "VECTOR_SIZE": "1", - "WORK_GROUP_SIZE_X": "32", - "WORK_GROUP_SIZE_Y": "4", - "WORK_GROUP_SIZE_Z": "1", - "Z_ITERATIONS": "32" + "INNER_UNROLL_FACTOR": 8, + "USE_CONSTANT_MEMORY": 0, + "USE_SOA": 1, + "VECTOR_SIZE": 1, + "WORK_GROUP_SIZE_X": 32, + "WORK_GROUP_SIZE_Y": 8, + "WORK_GROUP_SIZE_Z": 1, + "Z_ITERATIONS": 32 }, "correctness": 1, "invalidity": "correct", @@ -47866,61 +51419,61 @@ { "name": "time", "unit": "", - "value": 6283.904 + "value": 6354.176 }, { "name": "dram__throughput.avg.pct_of_peak_sustained_elapsed", "type": "Double", "unit": "", - "value": 16.464143266536922 + "value": 7.621236147855375 }, { "name": "dram__sectors_read.sum", "type": "Double", "unit": "", - "value": 2104164.0 + "value": 308.0 }, { "name": "dram__sectors_write.sum", "type": "Double", "unit": "", - "value": 1837452.0 + "value": 1835988.0 }, { "name": "lts__t_sectors.avg.pct_of_peak_sustained_elapsed", "type": "Double", "unit": "", - "value": 1.9732914896705638 + "value": 1.0114844084563546 }, { "name": "lts__t_sectors_op_read.sum", "type": "Double", "unit": "", - "value": 2205340.0 + "value": 101498.0 }, { "name": "lts__t_sectors_op_write.sum", "type": "Double", "unit": "", - "value": 2100784.0 + "value": 2099181.0 }, { "name": "l1tex__t_requests_pipe_lsu_mem_global_op_ld.avg.pct_of_peak_sustained_active", "type": "Double", "unit": "", - "value": 1.87170860796226 + "value": 1.663043900490872 }, { "name": "l1tex__t_requests_pipe_lsu_mem_global_op_ld.sum", "type": "Double", "unit": "", - "value": 4718592.0 + "value": 4194304.0 }, { "name": "l1tex__data_pipe_lsu_wavefronts_mem_shared.avg.pct_of_peak_sustained_elapsed", "type": "Double", "unit": "", - "value": 0.019493072249688995 + "value": 0.019483381522251438 }, { "name": "l1tex__data_pipe_lsu_wavefronts_mem_shared_op_ld.sum", @@ -47950,13 +51503,13 @@ "name": "sm__warps_active.avg.pct_of_peak_sustained_active", "type": "Double", "unit": "", - "value": 81.95858398375442 + "value": 81.88845506242292 }, { "name": "smsp__cycles_active.avg.pct_of_peak_sustained_elapsed", "type": "Double", "unit": "", - "value": 99.9738577405486 + "value": 99.95783396994162 }, { "name": "smsp__sass_thread_inst_executed_op_fp16_pred_on.sum", @@ -47968,7 +51521,7 @@ "name": "smsp__sass_thread_inst_executed_op_fp32_pred_on.sum", "type": "Double", "unit": "", - "value": 4479516672.0 + "value": 4462739456.0 }, { "name": "smsp__sass_thread_inst_executed_op_fp64_pred_on.sum", @@ -47980,25 +51533,25 @@ "name": "smsp__sass_thread_inst_executed_op_integer_pred_on.sum", "type": "Double", "unit": "", - "value": 5912395776.0 + "value": 7219970048.0 }, { "name": "smsp__sass_thread_inst_executed_op_control_pred_on.sum", "type": "Double", "unit": "", - "value": 101187584.0 + "value": 134217728.0 }, { "name": "smsp__sass_thread_inst_executed_op_memory_pred_on.sum", "type": "Double", "unit": "", - "value": 167772160.0 + "value": 150994944.0 }, { "name": "smsp__sass_thread_inst_executed_op_misc_pred_on.sum", "type": "Double", "unit": "", - "value": 1087897600.0 + "value": 551026688.0 }, { "name": "smsp__sass_thread_inst_executed_op_conversion_pred_on.sum", @@ -48010,13 +51563,13 @@ "name": "smsp__inst_executed.sum", "type": "Double", "unit": "", - "value": 914653184.0 + "value": 952385536.0 }, { "name": "smsp__pipe_fma_cycles_active.avg.pct_of_peak_sustained_active", "type": "Double", "unit": "", - "value": 38.802973051944555 + "value": 43.31035234973737 }, { "name": "smsp__inst_executed_pipe_fp64.avg.pct_of_peak_sustained_active", @@ -48028,13 +51581,13 @@ "name": "smsp__inst_executed_pipe_xu.avg.pct_of_peak_sustained_active", "type": "Double", "unit": "", - "value": 26.62150077337741 + "value": 26.612531686490854 }, { "name": "smsp__inst_executed_pipe_lsu.avg.pct_of_peak_sustained_active", "type": "Double", "unit": "", - "value": 1.0496514587159305 + "value": 0.9453426172813523 }, { "name": "smsp__inst_executed_pipe_tex.avg.pct_of_peak_sustained_active", @@ -48046,7 +51599,7 @@ "name": "smsp__issue_active.avg.pct_of_peak_sustained_active", "type": "Double", "unit": "", - "value": 90.81466927995243 + "value": 94.4192766724688 }, { "name": "smsp__thread_inst_executed_per_inst_executed.ratio", @@ -48065,30 +51618,47 @@ "time" ], "times": { - "compilation": 17798.441, - "data": 61568.878, - "framework": 3554976.7630000003, - "kernel_overhead": 1699896.458, - "profiling_overhead": 50687.585, - "profiling_runs": 1742823.842, + "compilation_time": 14526.573, + "data": 58508.587, + "framework": 3665366.315, + "kernel_overhead": 1757915.241, + "profiling_overhead": 48496.154, + "profiling_runs": 1800446.333, "runtimes": [ - 6283.904 + 6354.176 ], - "search_algorithm": 28.643, - "validation": 17.18 + "search_algorithm": 35.71, + "validation": 14.574 }, - "timestamp": "2026-01-27 09:27:35 UTC" + "timestamp": "2026-03-13 09:42:8 UTC" }, { + "compilation_data": { + "constant_memory_size": 0, + "global_size": { + "x": 8, + "y": 32, + "z": 8 + }, + "local_memory_size": 0, + "local_size": { + "x": 32, + "y": 8, + "z": 1 + }, + "max_work_group_size": 1024, + "private_memory_size": 0, + "registers": 48 + }, "configuration": { - "INNER_UNROLL_FACTOR": "16", - "USE_CONSTANT_MEMORY": "0", - "USE_SOA": "1", - "VECTOR_SIZE": "1", - "WORK_GROUP_SIZE_X": "32", - "WORK_GROUP_SIZE_Y": "8", - "WORK_GROUP_SIZE_Z": "1", - "Z_ITERATIONS": "32" + "INNER_UNROLL_FACTOR": 16, + "USE_CONSTANT_MEMORY": 0, + "USE_SOA": 1, + "VECTOR_SIZE": 1, + "WORK_GROUP_SIZE_X": 32, + "WORK_GROUP_SIZE_Y": 8, + "WORK_GROUP_SIZE_Z": 1, + "Z_ITERATIONS": 32 }, "correctness": 1, "invalidity": "correct", @@ -48096,61 +51666,61 @@ { "name": "time", "unit": "", - "value": 6304.32 + "value": 6112.544 }, { "name": "dram__throughput.avg.pct_of_peak_sustained_elapsed", "type": "Double", "unit": "", - "value": 16.52712655712771 + "value": 8.027909765336084 }, { "name": "dram__sectors_read.sum", "type": "Double", "unit": "", - "value": 2110932.0 + "value": 508.0 }, { "name": "dram__sectors_write.sum", "type": "Double", "unit": "", - "value": 1839120.0 + "value": 1833380.0 }, { "name": "lts__t_sectors.avg.pct_of_peak_sustained_elapsed", "type": "Double", "unit": "", - "value": 1.9804851011472249 + "value": 1.0621239917350704 }, { "name": "lts__t_sectors_op_read.sum", "type": "Double", "unit": "", - "value": 2212214.0 + "value": 97334.0 }, { "name": "lts__t_sectors_op_write.sum", "type": "Double", "unit": "", - "value": 2102329.0 + "value": 2099183.0 }, { "name": "l1tex__t_requests_pipe_lsu_mem_global_op_ld.avg.pct_of_peak_sustained_active", "type": "Double", "unit": "", - "value": 1.8748695419607588 + "value": 1.743817551532071 }, { "name": "l1tex__t_requests_pipe_lsu_mem_global_op_ld.sum", "type": "Double", "unit": "", - "value": 4718592.0 + "value": 4194304.0 }, { "name": "l1tex__data_pipe_lsu_wavefronts_mem_shared.avg.pct_of_peak_sustained_elapsed", "type": "Double", "unit": "", - "value": 0.019525957176308874 + "value": 0.02042898338076932 }, { "name": "l1tex__data_pipe_lsu_wavefronts_mem_shared_op_ld.sum", @@ -48180,13 +51750,13 @@ "name": "sm__warps_active.avg.pct_of_peak_sustained_active", "type": "Double", "unit": "", - "value": 81.87262847716042 + "value": 81.84946052250713 }, { "name": "smsp__cycles_active.avg.pct_of_peak_sustained_elapsed", "type": "Double", "unit": "", - "value": 99.9761339149827 + "value": 99.95744980086064 }, { "name": "smsp__sass_thread_inst_executed_op_fp16_pred_on.sum", @@ -48198,7 +51768,7 @@ "name": "smsp__sass_thread_inst_executed_op_fp32_pred_on.sum", "type": "Double", "unit": "", - "value": 4479516672.0 + "value": 4462739456.0 }, { "name": "smsp__sass_thread_inst_executed_op_fp64_pred_on.sum", @@ -48210,7 +51780,7 @@ "name": "smsp__sass_thread_inst_executed_op_integer_pred_on.sum", "type": "Double", "unit": "", - "value": 5912395776.0 + "value": 5944901632.0 }, { "name": "smsp__sass_thread_inst_executed_op_control_pred_on.sum", @@ -48222,13 +51792,13 @@ "name": "smsp__sass_thread_inst_executed_op_memory_pred_on.sum", "type": "Double", "unit": "", - "value": 167772160.0 + "value": 150994944.0 }, { "name": "smsp__sass_thread_inst_executed_op_misc_pred_on.sum", "type": "Double", "unit": "", - "value": 1087897600.0 + "value": 1054343168.0 }, { "name": "smsp__sass_thread_inst_executed_op_conversion_pred_on.sum", @@ -48240,13 +51810,13 @@ "name": "smsp__inst_executed.sum", "type": "Double", "unit": "", - "value": 914653184.0 + "value": 913571840.0 }, { "name": "smsp__pipe_fma_cycles_active.avg.pct_of_peak_sustained_active", "type": "Double", "unit": "", - "value": 38.86762933366204 + "value": 44.104512797632204 }, { "name": "smsp__inst_executed_pipe_fp64.avg.pct_of_peak_sustained_active", @@ -48258,13 +51828,13 @@ "name": "smsp__inst_executed_pipe_xu.avg.pct_of_peak_sustained_active", "type": "Double", "unit": "", - "value": 26.6658042815741 + "value": 27.904245287815343 }, { "name": "smsp__inst_executed_pipe_lsu.avg.pct_of_peak_sustained_active", "type": "Double", "unit": "", - "value": 1.0513982889341351 + "value": 0.991227463226839 }, { "name": "smsp__inst_executed_pipe_tex.avg.pct_of_peak_sustained_active", @@ -48276,7 +51846,7 @@ "name": "smsp__issue_active.avg.pct_of_peak_sustained_active", "type": "Double", "unit": "", - "value": 90.96581936652514 + "value": 95.078144197065 }, { "name": "smsp__thread_inst_executed_per_inst_executed.ratio", @@ -48295,19 +51865,19 @@ "time" ], "times": { - "compilation": 17138.247, - "data": 64072.067, - "framework": 3560020.715, - "kernel_overhead": 1699643.6, - "profiling_overhead": 53929.429, - "profiling_runs": 1742375.619, + "compilation_time": 17598.645, + "data": 58050.272, + "framework": 3573123.104, + "kernel_overhead": 1712983.429, + "profiling_overhead": 48052.906, + "profiling_runs": 1754036.497, "runtimes": [ - 6304.32 + 6112.544 ], - "search_algorithm": 14.401, - "validation": 19.035 + "search_algorithm": 13.239, + "validation": 22.74 }, - "timestamp": "2026-01-27 09:27:37 UTC" + "timestamp": "2026-03-13 09:42:10 UTC" } ], "schema_version": "1.0.0" diff --git a/Source/Output/JsonT4Converters.cpp b/Source/Output/JsonT4Converters.cpp index e39d95a8..f77c6a94 100644 --- a/Source/Output/JsonT4Converters.cpp +++ b/Source/Output/JsonT4Converters.cpp @@ -11,30 +11,27 @@ void to_json(json& j, const as_T4& configuration) j = json::object(); const std::vector& pairs = configuration.v.GetPairs(); for (const auto& pair : pairs) { - std::string value; switch (pair.GetValueType()) { case ParameterValueType::Int: - value = std::to_string(std::get(pair.GetValue())); + j[pair.GetName()] = std::get(pair.GetValue()); break; case ParameterValueType::UnsignedInt: - value = std::to_string(pair.GetValueUint()); + j[pair.GetName()] = pair.GetValueUint(); break; case ParameterValueType::Double: - value = std::to_string(std::get(pair.GetValue())); + j[pair.GetName()] = std::get(pair.GetValue()); break; case ParameterValueType::Bool: - value = std::to_string(std::get(pair.GetValue())); + j[pair.GetName()] = std::get(pair.GetValue()); break; case ParameterValueType::String: - value = pair.GetValueString(); + j[pair.GetName()] = pair.GetValueString(); break; default: KttError("Unhandled parameter value type"); } - - j[pair.GetName()] = value; } } @@ -42,36 +39,34 @@ void from_json(const json& j, as_T4& configuration) { std::vector pairs; for (auto it = j.begin(); it != j.end(); ++it) { - ParameterPair pair; std::string name = it.key(); - std::string valueStr; - - try { - valueStr = it.value().get(); - if (valueStr == "true" || valueStr == "false") { - pair = ParameterPair(name, valueStr == "true"); - } - // detect floating-point numbers (presence of '.' or exponent) - else if (valueStr.find('.') != std::string::npos || - valueStr.find('e') != std::string::npos || - valueStr.find('E') != std::string::npos) { - pair = ParameterPair(name, std::stod(valueStr)); - } - // detect unsigned integers - else if (!valueStr.empty() && valueStr.find_first_not_of("0123456789") == std::string::npos) { - pair = ParameterPair(name, static_cast(std::stoull(valueStr))); - } - // fallback: signed integer - else { - pair = ParameterPair(name, static_cast(std::stoll(valueStr))); - } - } catch (const std::invalid_argument&) { - pair = ParameterPair(name, valueStr); - } catch (const std::out_of_range&) { - pair = ParameterPair(name, valueStr); + const auto &jsonValue = it.value(); + + ParameterPair pair; + + if (jsonValue.is_boolean()) + { + pair = ParameterPair(name, jsonValue.get()); + } + else if (jsonValue.is_number_float()) + { + pair = ParameterPair(name, jsonValue.get()); + } + else if (jsonValue.is_number_unsigned()) + { + pair = ParameterPair(name, jsonValue.get()); + } + else if (jsonValue.is_number_integer()) + { + pair = ParameterPair(name, jsonValue.get()); + } + else if (jsonValue.is_string()) + { + pair = ParameterPair(name, jsonValue.get()); } - catch (const nlohmann::json::type_error& e) { - KttError("JSON type error while parsing"); + else + { + KttError("Unsupported parameter value type in configuration"); } pairs.push_back(pair); } @@ -97,7 +92,7 @@ void to_json(json& j, const as_T4& result) to_json(j_configuration,as_T4(configuration)); j["configuration"] = j_configuration; j["times"] = json::object(); - j["times"]["compilation"] = time.ConvertFromNanosecondsDouble(result.v.GetCompilationOverhead()); + j["times"]["compilation_time"] = time.ConvertFromNanosecondsDouble(result.v.GetCompilationOverhead()); j["times"]["data"] = time.ConvertFromNanosecondsDouble(result.v.GetDataMovementOverhead()); j["times"]["profiling_runs"] = time.ConvertFromNanosecondsDouble(result.v.GetProfilingRunsOverhead()); j["times"]["profiling_overhead"] = time.ConvertFromNanosecondsDouble(result.v.GetProfilingOverhead()); @@ -114,7 +109,7 @@ void to_json(json& j, const as_T4& result) j["measurements"].push_back({{"name","time"}, {"value",time.ConvertFromNanosecondsDouble(result.v.GetTotalDuration())}, {"unit",""}}); const std::vector& compResults = result.v.GetResults(); - if (compResults[0].HasProfilingData()) { + if (!compResults.empty() && compResults[0].HasProfilingData()) { const std::vector& counters = compResults[0].GetProfilingData().GetCounters(); for (const auto& counter : counters) { json j_counter = json::object(); @@ -123,6 +118,29 @@ void to_json(json& j, const as_T4& result) } } + if (!compResults.empty() && compResults[0].HasCompilationData()) { + const KernelCompilationData& compilationData = compResults[0].GetCompilationData(); + const DimensionVector& globalSize = compResults[0].GetGlobalSize(); + const DimensionVector& localSize = compResults[0].GetLocalSize(); + json j_compilationData = json::object(); + j["compilation_data"] = { + {"max_work_group_size", compilationData.m_MaxWorkGroupSize}, + {"local_memory_size", compilationData.m_LocalMemorySize}, + {"private_memory_size", compilationData.m_PrivateMemorySize}, + {"constant_memory_size", compilationData.m_ConstantMemorySize}, + {"registers", compilationData.m_RegistersCount}, + {"global_size", { + {"x", globalSize.GetSizeX()}, + {"y", globalSize.GetSizeY()}, + {"z", globalSize.GetSizeZ()} + }}, + {"local_size", { + {"x", localSize.GetSizeX()}, + {"y", localSize.GetSizeY()}, + {"z", localSize.GetSizeZ()} + }} + }; + } } void from_json(const json& j, as_T4& result) @@ -146,7 +164,7 @@ void from_json(const json& j, as_T4& result) const Nanoseconds durationNs = time.ConvertToNanosecondsDouble(duration); double compilationOverhead; - j.at("times").at("compilation").get_to(compilationOverhead); + j.at("times").at("compilation_time").get_to(compilationOverhead); const Nanoseconds compilationOverheadNs = time.ConvertToNanosecondsDouble(compilationOverhead); double dataMovementOverhead; @@ -195,6 +213,50 @@ void from_json(const json& j, as_T4& result) computationResult.SetProfilingData(std::move(uniqueData)); } + if (j.contains("compilation_data")) + { + const auto& compilationDataJson = j["compilation_data"]; + + if (!compilationDataJson.contains("max_work_group_size") || + !compilationDataJson.contains("local_memory_size") || + !compilationDataJson.contains("private_memory_size") || + !compilationDataJson.contains("constant_memory_size") || + !compilationDataJson.contains("registers") || + !compilationDataJson.contains("global_size") || + !compilationDataJson.contains("local_size")) + { + KttError( + "Missing compilation data fields. Required fields: max_work_group_size, local_memory_size, private_memory_size, constant_memory_size, registers, global_size, local_size"); + } + + // Extract compilation data + KernelCompilationData compData; + compData.m_MaxWorkGroupSize = compilationDataJson["max_work_group_size"]; + compData.m_LocalMemorySize = compilationDataJson["local_memory_size"]; + compData.m_PrivateMemorySize = compilationDataJson["private_memory_size"]; + compData.m_ConstantMemorySize = compilationDataJson["constant_memory_size"]; + compData.m_RegistersCount = compilationDataJson["registers"]; + + // Extract global size + const auto& globalSizeJson = compilationDataJson["global_size"]; + if (!globalSizeJson.contains("x") || !globalSizeJson.contains("y") || !globalSizeJson.contains("z")) + { + KttError("Missing global_size dimensions"); + } + DimensionVector globalSize(globalSizeJson["x"], globalSizeJson["y"], globalSizeJson["z"]); + + // Extract local size + const auto& localSizeJson = compilationDataJson["local_size"]; + if (!localSizeJson.contains("x") || !localSizeJson.contains("y") || !localSizeJson.contains("z")) + { + KttError("Missing local_size dimensions"); + } + DimensionVector localSize(localSizeJson["x"], localSizeJson["y"], localSizeJson["z"]); + + computationResult.SetCompilationData(std::make_unique(compData)); + computationResult.SetSizeData(globalSize, localSize); + } + results.push_back(computationResult); result.v = KernelResult(kernelName, configuration, results, timestamp); diff --git a/Tutorials/03KernelTuning/FullSearchSpace.t4.json b/Tutorials/03KernelTuning/FullSearchSpace.t4.json index 1bf626bb..23c0d7b8 100644 --- a/Tutorials/03KernelTuning/FullSearchSpace.t4.json +++ b/Tutorials/03KernelTuning/FullSearchSpace.t4.json @@ -5,13 +5,30 @@ "compute_api": "CUDA", "device": "NVIDIA RTX 500 Ada Generation Laptop GPU", "platform": "NVIDIA CUDA", - "timestamp": "2026-01-27 09:23:42 UTC", + "timestamp": "2026-03-13 09:44:4 UTC", "timeunit": "microseconds" }, "results": [ { + "compilation_data": { + "constant_memory_size": 0, + "global_size": { + "x": 32768, + "y": 1, + "z": 1 + }, + "local_memory_size": 0, + "local_size": { + "x": 32, + "y": 1, + "z": 1 + }, + "max_work_group_size": 1024, + "private_memory_size": 0, + "registers": 12 + }, "configuration": { - "multiply_block_size": "32" + "multiply_block_size": 32 }, "correctness": 1, "invalidity": "correct", @@ -19,49 +36,49 @@ { "name": "time", "unit": "", - "value": 92.0 + "value": 89.44 }, { "name": "dram__throughput.avg.pct_of_peak_sustained_elapsed", "type": "Double", "unit": "", - "value": 19.432367149758452 + "value": 19.12216730954677 }, { "name": "dram__sectors_read.sum", "type": "Double", "unit": "", - "value": 2188.0 + "value": 1696.0 }, { "name": "dram__sectors_write.sum", "type": "Double", "unit": "", - "value": 49300.0 + "value": 49068.0 }, { "name": "lts__t_sectors.avg.pct_of_peak_sustained_elapsed", "type": "Double", "unit": "", - "value": 20.098754245965317 + "value": 20.021818505655002 }, { "name": "lts__t_sectors_op_read.sum", "type": "Double", "unit": "", - "value": 263487.0 + "value": 263544.0 }, { "name": "lts__t_sectors_op_write.sum", "type": "Double", "unit": "", - "value": 132107.0 + "value": 132012.0 }, { "name": "l1tex__t_requests_pipe_lsu_mem_global_op_ld.avg.pct_of_peak_sustained_active", "type": "Double", "unit": "", - "value": 6.360174376851201 + "value": 6.427757366282913 }, { "name": "l1tex__t_requests_pipe_lsu_mem_global_op_ld.sum", @@ -73,7 +90,7 @@ "name": "l1tex__data_pipe_lsu_wavefronts_mem_shared.avg.pct_of_peak_sustained_elapsed", "type": "Double", "unit": "", - "value": 1.4880152289058584 + "value": 1.4854734765374251 }, { "name": "l1tex__data_pipe_lsu_wavefronts_mem_shared_op_ld.sum", @@ -103,13 +120,13 @@ "name": "sm__warps_active.avg.pct_of_peak_sustained_active", "type": "Double", "unit": "", - "value": 25.119426339496563 + "value": 25.2548705608268 }, { "name": "smsp__cycles_active.avg.pct_of_peak_sustained_elapsed", "type": "Double", "unit": "", - "value": 41.87776550681886 + "value": 40.84136332809888 }, { "name": "smsp__sass_thread_inst_executed_op_fp16_pred_on.sum", @@ -169,7 +186,7 @@ "name": "smsp__pipe_fma_cycles_active.avg.pct_of_peak_sustained_active", "type": "Double", "unit": "", - "value": 7.993584919572935 + "value": 8.183513926905666 }, { "name": "smsp__inst_executed_pipe_fp64.avg.pct_of_peak_sustained_active", @@ -187,7 +204,7 @@ "name": "smsp__inst_executed_pipe_lsu.avg.pct_of_peak_sustained_active", "type": "Double", "unit": "", - "value": 7.106469081611187 + "value": 7.274357932686437 }, { "name": "smsp__inst_executed_pipe_tex.avg.pct_of_peak_sustained_active", @@ -199,7 +216,7 @@ "name": "smsp__issue_active.avg.pct_of_peak_sustained_active", "type": "Double", "unit": "", - "value": 13.384947115707838 + "value": 13.701274699806198 }, { "name": "smsp__thread_inst_executed_per_inst_executed.ratio", @@ -218,23 +235,40 @@ "time" ], "times": { - "compilation": 33384.951, - "data": 6377.032, - "framework": 40685.69, - "kernel_overhead": 8855.066, - "profiling_overhead": 4024.685, - "profiling_runs": 21428.907, + "compilation_time": 15061.987, + "data": 6837.891, + "framework": 40386.628, + "kernel_overhead": 8584.186, + "profiling_overhead": 3889.532, + "profiling_runs": 21075.019, "runtimes": [ - 92.0 + 89.44 ], - "search_algorithm": 15.506, - "validation": 11025.241 + "search_algorithm": 12.432, + "validation": 9727.423 }, - "timestamp": "2026-01-27 09:23:41 UTC" + "timestamp": "2026-03-13 09:44:4 UTC" }, { + "compilation_data": { + "constant_memory_size": 0, + "global_size": { + "x": 16384, + "y": 1, + "z": 1 + }, + "local_memory_size": 0, + "local_size": { + "x": 64, + "y": 1, + "z": 1 + }, + "max_work_group_size": 1024, + "private_memory_size": 0, + "registers": 12 + }, "configuration": { - "multiply_block_size": "64" + "multiply_block_size": 64 }, "correctness": 1, "invalidity": "correct", @@ -242,49 +276,49 @@ { "name": "time", "unit": "", - "value": 56.256 + "value": 54.336 }, { "name": "dram__throughput.avg.pct_of_peak_sustained_elapsed", "type": "Double", "unit": "", - "value": 39.84525529865125 + "value": 40.05379593810445 }, { "name": "dram__sectors_read.sum", "type": "Double", "unit": "", - "value": 3768.0 + "value": 3576.0 }, { "name": "dram__sectors_write.sum", "type": "Double", "unit": "", - "value": 49172.0 + "value": 49436.0 }, { "name": "lts__t_sectors.avg.pct_of_peak_sustained_elapsed", "type": "Double", "unit": "", - "value": 39.99961514778325 + "value": 40.105867823656546 }, { "name": "lts__t_sectors_op_read.sum", "type": "Double", "unit": "", - "value": 262719.0 + "value": 262809.0 }, { "name": "lts__t_sectors_op_write.sum", "type": "Double", "unit": "", - "value": 132098.0 + "value": 131947.0 }, { "name": "l1tex__t_requests_pipe_lsu_mem_global_op_ld.avg.pct_of_peak_sustained_active", "type": "Double", "unit": "", - "value": 6.8597363121381 + "value": 6.901737439945743 }, { "name": "l1tex__t_requests_pipe_lsu_mem_global_op_ld.sum", @@ -296,7 +330,7 @@ "name": "l1tex__data_pipe_lsu_wavefronts_mem_shared.avg.pct_of_peak_sustained_elapsed", "type": "Double", "unit": "", - "value": 2.965129326258334 + "value": 2.97997104422667 }, { "name": "l1tex__data_pipe_lsu_wavefronts_mem_shared_op_ld.sum", @@ -326,13 +360,13 @@ "name": "sm__warps_active.avg.pct_of_peak_sustained_active", "type": "Double", "unit": "", - "value": 29.58726199497857 + "value": 29.65983910408843 }, { "name": "smsp__cycles_active.avg.pct_of_peak_sustained_elapsed", "type": "Double", "unit": "", - "value": 81.80781676427367 + "value": 81.35844773773927 }, { "name": "smsp__sass_thread_inst_executed_op_fp16_pred_on.sum", @@ -392,7 +426,7 @@ "name": "smsp__pipe_fma_cycles_active.avg.pct_of_peak_sustained_active", "type": "Double", "unit": "", - "value": 8.154198577983072 + "value": 8.242345387559466 }, { "name": "smsp__inst_executed_pipe_fp64.avg.pct_of_peak_sustained_active", @@ -410,7 +444,7 @@ "name": "smsp__inst_executed_pipe_lsu.avg.pct_of_peak_sustained_active", "type": "Double", "unit": "", - "value": 7.249012242441941 + "value": 7.325535644024753 }, { "name": "smsp__inst_executed_pipe_tex.avg.pct_of_peak_sustained_active", @@ -422,7 +456,7 @@ "name": "smsp__issue_active.avg.pct_of_peak_sustained_active", "type": "Double", "unit": "", - "value": 13.696757323564157 + "value": 13.839641011195766 }, { "name": "smsp__thread_inst_executed_per_inst_executed.ratio", @@ -441,23 +475,40 @@ "time" ], "times": { - "compilation": 12291.298, - "data": 4883.968, - "framework": 42371.962, - "kernel_overhead": 10481.932, - "profiling_overhead": 3881.842, - "profiling_runs": 23124.22, + "compilation_time": 12324.712, + "data": 4618.838, + "framework": 38430.715, + "kernel_overhead": 8863.657, + "profiling_overhead": 3822.415, + "profiling_runs": 21125.805, "runtimes": [ - 56.256 + 54.336 ], - "search_algorithm": 11.752, - "validation": 10612.768 + "search_algorithm": 8.477, + "validation": 9358.694 }, - "timestamp": "2026-01-27 09:23:41 UTC" + "timestamp": "2026-03-13 09:44:4 UTC" }, { + "compilation_data": { + "constant_memory_size": 0, + "global_size": { + "x": 8192, + "y": 1, + "z": 1 + }, + "local_memory_size": 0, + "local_size": { + "x": 128, + "y": 1, + "z": 1 + }, + "max_work_group_size": 1024, + "private_memory_size": 0, + "registers": 12 + }, "configuration": { - "multiply_block_size": "128" + "multiply_block_size": 128 }, "correctness": 1, "invalidity": "correct", @@ -465,49 +516,49 @@ { "name": "time", "unit": "", - "value": 50.272 + "value": 44.256 }, { "name": "dram__throughput.avg.pct_of_peak_sustained_elapsed", "type": "Double", "unit": "", - "value": 79.78093853820599 + "value": 71.57711330935251 }, { "name": "dram__sectors_read.sum", "type": "Double", "unit": "", - "value": 15436.0 + "value": 9788.0 }, { "name": "dram__sectors_write.sum", "type": "Double", "unit": "", - "value": 46040.0 + "value": 41152.0 }, { "name": "lts__t_sectors.avg.pct_of_peak_sustained_elapsed", "type": "Double", "unit": "", - "value": 69.0787996969782 + "value": 74.65849479972103 }, { "name": "lts__t_sectors_op_read.sum", "type": "Double", "unit": "", - "value": 262449.0 + "value": 262462.0 }, { "name": "lts__t_sectors_op_write.sum", "type": "Double", "unit": "", - "value": 131341.0 + "value": 131348.0 }, { "name": "l1tex__t_requests_pipe_lsu_mem_global_op_ld.avg.pct_of_peak_sustained_active", "type": "Double", "unit": "", - "value": 13.039914043535358 + "value": 13.352566766908714 }, { "name": "l1tex__t_requests_pipe_lsu_mem_global_op_ld.sum", @@ -519,7 +570,7 @@ "name": "l1tex__data_pipe_lsu_wavefronts_mem_shared.avg.pct_of_peak_sustained_elapsed", "type": "Double", "unit": "", - "value": 5.131545978451516 + "value": 5.5507372072853425 }, { "name": "l1tex__data_pipe_lsu_wavefronts_mem_shared_op_ld.sum", @@ -549,13 +600,13 @@ "name": "sm__warps_active.avg.pct_of_peak_sustained_active", "type": "Double", "unit": "", - "value": 60.73727897382838 + "value": 62.16194625776604 }, { "name": "smsp__cycles_active.avg.pct_of_peak_sustained_elapsed", "type": "Double", "unit": "", - "value": 76.4738239163117 + "value": 82.55620527970513 }, { "name": "smsp__sass_thread_inst_executed_op_fp16_pred_on.sum", @@ -615,7 +666,7 @@ "name": "smsp__pipe_fma_cycles_active.avg.pct_of_peak_sustained_active", "type": "Double", "unit": "", - "value": 15.095695800750002 + "value": 15.128991154364646 }, { "name": "smsp__inst_executed_pipe_fp64.avg.pct_of_peak_sustained_active", @@ -633,7 +684,7 @@ "name": "smsp__inst_executed_pipe_lsu.avg.pct_of_peak_sustained_active", "type": "Double", "unit": "", - "value": 13.42039855118836 + "value": 13.447171387006293 }, { "name": "smsp__inst_executed_pipe_tex.avg.pct_of_peak_sustained_active", @@ -645,7 +696,7 @@ "name": "smsp__issue_active.avg.pct_of_peak_sustained_active", "type": "Double", "unit": "", - "value": 25.408111809350725 + "value": 25.45905584999969 }, { "name": "smsp__thread_inst_executed_per_inst_executed.ratio", @@ -664,23 +715,40 @@ "time" ], "times": { - "compilation": 13162.126, - "data": 4915.018, - "framework": 39486.164000000004, - "kernel_overhead": 8791.818, - "profiling_overhead": 4019.025, - "profiling_runs": 21760.303, + "compilation_time": 12721.594, + "data": 4697.985, + "framework": 38527.655, + "kernel_overhead": 8903.951, + "profiling_overhead": 3881.076, + "profiling_runs": 21044.643, "runtimes": [ - 50.272 + 44.256 ], - "search_algorithm": 11.35, - "validation": 10382.171 + "search_algorithm": 9.675, + "validation": 9290.56 }, - "timestamp": "2026-01-27 09:23:41 UTC" + "timestamp": "2026-03-13 09:44:4 UTC" }, { + "compilation_data": { + "constant_memory_size": 0, + "global_size": { + "x": 4096, + "y": 1, + "z": 1 + }, + "local_memory_size": 0, + "local_size": { + "x": 256, + "y": 1, + "z": 1 + }, + "max_work_group_size": 1024, + "private_memory_size": 0, + "registers": 12 + }, "configuration": { - "multiply_block_size": "256" + "multiply_block_size": 256 }, "correctness": 1, "invalidity": "correct", @@ -688,49 +756,49 @@ { "name": "time", "unit": "", - "value": 48.64 + "value": 44.576 }, { "name": "dram__throughput.avg.pct_of_peak_sustained_elapsed", "type": "Double", "unit": "", - "value": 77.39057570977917 + "value": 72.94351374570446 }, { "name": "dram__sectors_read.sum", "type": "Double", "unit": "", - "value": 12692.0 + "value": 10632.0 }, { "name": "dram__sectors_write.sum", "type": "Double", "unit": "", - "value": 50112.0 + "value": 43708.0 }, { "name": "lts__t_sectors.avg.pct_of_peak_sustained_elapsed", "type": "Double", "unit": "", - "value": 65.45649428343525 + "value": 71.22926120761916 }, { "name": "lts__t_sectors_op_read.sum", "type": "Double", "unit": "", - "value": 262472.0 + "value": 262455.0 }, { "name": "lts__t_sectors_op_write.sum", "type": "Double", "unit": "", - "value": 131283.0 + "value": 131700.0 }, { "name": "l1tex__t_requests_pipe_lsu_mem_global_op_ld.avg.pct_of_peak_sustained_active", "type": "Double", "unit": "", - "value": 13.011280837664735 + "value": 13.16272035445509 }, { "name": "l1tex__t_requests_pipe_lsu_mem_global_op_ld.sum", @@ -742,7 +810,7 @@ "name": "l1tex__data_pipe_lsu_wavefronts_mem_shared.avg.pct_of_peak_sustained_elapsed", "type": "Double", "unit": "", - "value": 4.862413637550749 + "value": 5.2921264115351825 }, { "name": "l1tex__data_pipe_lsu_wavefronts_mem_shared_op_ld.sum", @@ -772,13 +840,13 @@ "name": "sm__warps_active.avg.pct_of_peak_sustained_active", "type": "Double", "unit": "", - "value": 62.664028283494076 + "value": 63.26284267038368 }, { "name": "smsp__cycles_active.avg.pct_of_peak_sustained_elapsed", "type": "Double", "unit": "", - "value": 73.746557373157 + "value": 79.48432937543606 }, { "name": "smsp__sass_thread_inst_executed_op_fp16_pred_on.sum", @@ -838,7 +906,7 @@ "name": "smsp__pipe_fma_cycles_active.avg.pct_of_peak_sustained_active", "type": "Double", "unit": "", - "value": 14.840054247546686 + "value": 14.978992866565715 }, { "name": "smsp__inst_executed_pipe_fp64.avg.pct_of_peak_sustained_active", @@ -856,7 +924,7 @@ "name": "smsp__inst_executed_pipe_lsu.avg.pct_of_peak_sustained_active", "type": "Double", "unit": "", - "value": 13.186822031425748 + "value": 13.316150373587144 }, { "name": "smsp__inst_executed_pipe_tex.avg.pct_of_peak_sustained_active", @@ -868,7 +936,7 @@ "name": "smsp__issue_active.avg.pct_of_peak_sustained_active", "type": "Double", "unit": "", - "value": 24.98093488483463 + "value": 25.225932723328103 }, { "name": "smsp__thread_inst_executed_per_inst_executed.ratio", @@ -887,19 +955,19 @@ "time" ], "times": { - "compilation": 12729.014, - "data": 4813.984, - "framework": 38633.167, - "kernel_overhead": 8850.544, - "profiling_overhead": 3927.906, - "profiling_runs": 21040.733, + "compilation_time": 12673.4, + "data": 4649.726, + "framework": 37830.429, + "kernel_overhead": 8500.152, + "profiling_overhead": 3821.577, + "profiling_runs": 20858.974, "runtimes": [ - 48.64 + 44.576 ], - "search_algorithm": 4.332, - "validation": 9586.204 + "search_algorithm": 3.471, + "validation": 9406.798 }, - "timestamp": "2026-01-27 09:23:42 UTC" + "timestamp": "2026-03-13 09:44:4 UTC" } ], "schema_version": "1.0.0"